This script documents the complete workflow used to analyse the data for the following publication:

Functional strain redundancy and persistent phage infection in a Swiss hard cheese starter culture

The workflow is divided into three sub-scripts. Each one is saved and compiled as an R Markdown document with extensive comments. The three parts consist of:

  1. the general analysis including all genomic and non-genomic data-analysis
  2. the plotting and analysis for the main body figures
  3. the plotting and analysis for the supplemental figures

Here, we look at the second part: Main Body Figures

1 Main Body Figures

Here, I go through the creation of the different plots for the Figures

1.1 Figure 1

Figure 1. Assembly, annotation, and functional properties of the two metagenome-assembled genomes (MAGs) from the Swiss hard cheese starter culture RMK202. A) The Metagenome-assembled-genomes of S. thermophilus and L. delbrueckii with different genetic features highlighted (see legend). B) Functional properties potentially involved in the metabolic interaction of the two species. Filled red circles indicate presence, while empty circles indicate absence.

Figure 1. Assembly, annotation, and functional properties of the two metagenome-assembled genomes (MAGs) from the Swiss hard cheese starter culture RMK202. A) The Metagenome-assembled-genomes of S. thermophilus and L. delbrueckii with different genetic features highlighted (see legend). B) Functional properties potentially involved in the metabolic interaction of the two species. Filled red circles indicate presence, while empty circles indicate absence.

1.1.1 Circos plot

Here, I create the Circos plots for the bacteria. I will include the following information: 1. genome (circular) 2. forward genes (2.2. tRNA and rRNA colored)
3. reverse genes (3.2. tRNA and rRNA colored) 4. pseudogenes 5. prophage location 6. GC skew (https://dbsloan.github.io/TS2019/exercises/circos.html#add-gc-skew-data-to-the-plot)


##==================
##Sterm
##==================
# Builds the Circos track files (genes, rRNA, tRNA, transposases, pseudogenes,
# prophages, protease, transporters, CRISPR, GC skew) for the S. thermophilus
# RMK202 MAG and renders the plot. Every track file is tab-separated:
# <chromosome label> <start> <end> [options].
home=/home/vincent/bin/apps/circos-0.69-6/
cd "$home"

# Inputs/outputs shared by all tracks of this section.
circos_src=/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm
circos_gff=$circos_src/S_thermophilus_RMK202.current.gb.gff
circos_acc=CP046134                      # sequence ID matched against GFF column 1
circos_label=S_thermophilus_mag_rmk202   # chromosome label written to every track file
circos_out=$home/data/rmk202/MAG_rmk202_sterm

##organisational 
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ticks_nwc_1_ldel_both.conf /home/vincent/bin/apps/circos-0.69-6/etc/ticks_mag_both.conf

#cp /home/vincent/bin/apps/circos-0.69-6/etc/ideogram.conf /home/vincent/bin/apps/circos-0.69-6/etc/ideogram_mag.conf
mkdir -p "$circos_out/"
##---------------------karyotyp
#seqlength.py "/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202 (2).fasta" |head -1 > ${home}/data/karyotype/sterm_magg_rmk202.txt
##have to change the format a bit

##---------------------genes
# Convert the PGAP GenBank file to GFF3; the *.gb.gff file read below is
# presumably the output of this conversion (verify the converter's naming).
cd "$circos_src"
/usr/bin/perl /home/vincent/miniconda3/bin/bp_genbank2gff3.pl S_thermophilus_RMK202.current.gb
cd "$home"

##---------------------genes / rRNA / tRNA, one file per strand
# CDS features go to genes_{forward,reverse}.txt, RNA features to
# <type>_{forward,reverse}.txt.
for feature in CDS rRNA tRNA; do
  if [ "$feature" = "CDS" ]; then track=genes; else track=$feature; fi
  for strand in + -; do
    if [ "$strand" = "+" ]; then direction=forward; else direction=reverse; fi
    grep -v "^#" "$circos_gff" \
      | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
            -v type="$feature" -v strand="$strand" \
            '$1==acc && $3==type && $7==strand {print label,$4,$5}' \
      > "$circos_out/${track}_${direction}.txt"
  done
done

##---------------------transposase from PGAP
# Any chromosome feature whose GFF line mentions "transposase" (case-insensitive).
grep -i "transposase" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc {print label,$4,$5}' \
  > "$circos_out/transposase.txt"

##---------------------pseudogenes
grep -v "^#" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc && $3=="pseudogene" {print label,$4,$5}' \
  > "$circos_out/pseudogenes.txt"

##---------------------prophages
# PHASTER summary restricted to the chromosome of this species.
awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
    '$1==acc {print label,$4,$5}' \
    /home/vincent/Desktop/Projects/2019_RMK202_analysis/phage_annotation/phaster/ZZ_be76e5e924.PHASTER/prophage_summary_onlyGenome.gff \
  > "$circos_out/prophages.txt"

##---------------------protease

#only in l. delbrueckii 
#I checked online on NCBI if the genomes contain PrtS or PrtB

#/home/vincent/Downloads/prtS_Sterm.fasta

grep -v "^#" "$circos_gff" \
  | grep "S8 family serine peptidase" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc {print label,$4,$5}' \
  > "$circos_out/protease.txt"

##---------------------transporter
grep -v "^#" "$circos_gff" \
  | grep "transporter" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc {print label,$4,$5}' \
  > "$circos_out/transporter.txt"

##---------------------CRISPR arrays

#grep "SUMMARY BY POSITION" -A 50 $home/data/rmk202/MAG_rmk202_sterm/S_thermophilus_RMK202.pilarCR_out|grep "^====" -A 50 | sed '1d' | sed 's/^ *//g'| sed 's/ \{1,\}/\t/g'|awk -F "\t" '{OFS="\t"}{print "S_thermophilus_mag_rmk202",$3,$3+$4,"fill_color=blue"}' >  $home/data/rmk202/MAG_rmk202_sterm/CRISPR.txt
##--cas genes

#grep "Cas" /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202.current.gb.gff
# CRISPR-annotated CDS (color=chr3) first, then repeat regions (color=chr2)
# appended to the same file.
grep "CRISPR" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc && $3=="CDS" {print label,$4,$5,"color=chr3"}' \
  > "$circos_out/CRISPR.txt"

grep "CRISPR" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc && $3=="repeat_region" {print label,$4,$5,"color=chr2"}' \
  >> "$circos_out/CRISPR.txt"

##---------------------gc skew
GCcalc.py -f "/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202 (2).fasta" > "$circos_out/gc_skew_sterm.txt"
# Colour by the sign of column 5 (blue if > 0, orange otherwise).
# The redirect target must stay on the same line as ">": in the previous
# version it was on the next line, which is a shell syntax error, so the
# cleaned file was never written.
awk -F "\t" -v OFS="\t" \
    '{ if ($5>0) print $1,$2,$2,$5,"fill_color=blue"; else print $1,$2,$2,$5,"fill_color=orange" }' \
    "$circos_out/gc_skew_sterm.txt" > "$circos_out/gc_skew_sterm_cleaned.txt"

##example
cp etc/repeat_nwc1_sterm_01.conf etc/sterm_rmk202_mag_circos.conf

#example run
bin/circos -conf etc/sterm_rmk202_mag_circos.conf -outputfile ./Sterm_mag_rmk202.svg; firefox ./Sterm_mag_rmk202.svg &

##==================
##Ldel
##==================
# Builds the Circos track files (genes, rRNA, transposases, pseudogenes,
# prophages, CRISPR, protease, transporters, GC skew) for the L. delbrueckii
# RMK202 MAG and renders the plot. Every track file is tab-separated:
# <chromosome label> <start> <end> [options].
# Note: no tRNA tracks are generated for this genome in this section.
home=/home/vincent/bin/apps/circos-0.69-6/
cd "$home"

# Inputs/outputs shared by all tracks of this section.
circos_src=/home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel
circos_gff=$circos_src/L_delbrueckii_RMK202.current.gb.gff
circos_acc=CP046131                     # sequence ID matched against GFF column 1
circos_label=L_delbrueckii_mag_rmk202   # chromosome label written to every track file
circos_out=$home/data/rmk202/MAG_rmk202_ldel

##organisational 
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ticks_nwc_1_ldel_both.conf /home/vincent/bin/apps/circos-0.69-6/etc/ticks_mag_both.conf ##already done
#cp /home/vincent/bin/apps/circos-0.69-6/etc/ideogram.conf /home/vincent/bin/apps/circos-0.69-6/etc/ideogram_mag.conf ##already done
#cp /home/vincent/bin/apps/circos-0.69-6/etc/sterm_rmk202_mag_circos.conf /home/vincent/bin/apps/circos-0.69-6/etc/ldel_rmk202_mag_circos.conf
mkdir -p "$circos_out/"
##---------------------karyotyp

#seqlength.py /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.fasta |head -1 > ${home}/data/karyotype/ldel_magg_rmk202.txt


##have to change the format a bit

##---------------------genes
# Convert the PGAP GenBank file to GFF3; the *.gb.gff file read below is
# presumably the output of this conversion (verify the converter's naming).
cd "$circos_src"
/usr/bin/perl /home/vincent/miniconda3/bin/bp_genbank2gff3.pl L_delbrueckii_RMK202.current.gb
cd "$home"

##---------------------genes / rRNA, one file per strand
# CDS features go to genes_{forward,reverse}.txt, rRNA features to
# rRNA_{forward,reverse}.txt.
for feature in CDS rRNA; do
  if [ "$feature" = "CDS" ]; then track=genes; else track=$feature; fi
  for strand in + -; do
    if [ "$strand" = "+" ]; then direction=forward; else direction=reverse; fi
    grep -v "^#" "$circos_gff" \
      | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
            -v type="$feature" -v strand="$strand" \
            '$1==acc && $3==type && $7==strand {print label,$4,$5}' \
      > "$circos_out/${track}_${direction}.txt"
  done
done

##---------------------transposase from PGAP
# Any chromosome feature whose GFF line mentions "transposase" (case-insensitive).
grep -i "transposase" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc {print label,$4,$5}' \
  > "$circos_out/transposase.txt"

##---------------------pseudogenes
grep -v "^#" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc && $3=="pseudogene" {print label,$4,$5}' \
  > "$circos_out/pseudogenes.txt"
#grep "^#" -v /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.current.gb.gff |awk -F "\t" '{OFS="\t"}{if($1=="CP046134" && $3=="tRNA" && $7=="-") print "L_delbrueckii_mag_rmk202",$4,$5}' > $home/data/rmk202/MAG_rmk202_ldel/tRNA_reverse.txt

##---------------------prophages
# Filter on the L. delbrueckii accession. The previous version filtered on
# CP046134 (the S. thermophilus chromosome), which wrote S. thermophilus
# prophage coordinates under the L. delbrueckii label.
awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
    '$1==acc {print label,$4,$5}' \
    /home/vincent/Desktop/Projects/2019_RMK202_analysis/phage_annotation/phaster/ZZ_be76e5e924.PHASTER/prophage_summary_onlyGenome.gff \
  > "$circos_out/prophages.txt"

##---------------------CRISPR arrays

#grep "Cas" /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.current.gb.gff
# CRISPR-annotated CDS (color=chr3) first, then repeat regions (color=chr2)
# appended to the same file.
grep "CRISPR" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc && $3=="CDS" {print label,$4,$5,"color=chr3"}' \
  > "$circos_out/CRISPR.txt"

grep "CRISPR" "$circos_gff" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc && $3=="repeat_region" {print label,$4,$5,"color=chr2"}' \
  >> "$circos_out/CRISPR.txt"

##---------------------protease

#only in l. delbrueckii 
#I checked online on NCBI if the genomes contain PrtS or PrtB

#/home/vincent/Downloads/prtS_Sterm.fasta

grep -v "^#" "$circos_gff" \
  | grep "S8 family serine peptidase" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc {print label,$4,$5}' \
  > "$circos_out/protease.txt"

##---------------------transporter
grep -v "^#" "$circos_gff" \
  | grep "transporter" \
  | awk -F "\t" -v OFS="\t" -v acc="$circos_acc" -v label="$circos_label" \
        '$1==acc {print label,$4,$5}' \
  > "$circos_out/transporter.txt"

##---------------------gc skew
GCcalc.py -f /home/vincent/Desktop/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.fasta > "$circos_out/gc_skew_ldel.txt"
# Colour by the sign of column 5 (blue if > 0, orange otherwise).
awk -F "\t" -v OFS="\t" \
    '{ if ($5>0) print $1,$2,$2,$5,"fill_color=blue"; else print $1,$2,$2,$5,"fill_color=orange" }' \
    "$circos_out/gc_skew_ldel.txt" > "$circos_out/gc_skew_ldel_cleaned.txt"


##example
#cp etc/repeat_nwc1_sterm_01.conf etc/ldel_rmk202_mag_circos.conf

#example run
bin/circos -conf etc/ldel_rmk202_mag_circos.conf -outputfile ./ldel_mag_rmk202.svg; firefox ./ldel_mag_rmk202.svg &

1.1.2 POGENOM

This is the analysis done with POGENOM. We do this to get population-genomic insights such as pN/pS ratios (the polymorphism-based analogue of dN/dS). It is important to note that the GFF file should not contain any FASTA entries.

# Prepare the POGENOM inputs: a combined CDS-only GFF for both species whose
# sequence IDs are renamed from the PROKKA contig names to the accessions
# CP046131 / CP046134, followed by a sanity check of the sequence IDs found
# in the GFF, the assembly and the VCF.
VCFFILE=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta

pogenom_annot=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation
pogenom_prep=$pogenom_annot/PROKKA/ForPOGENOM

mkdir -p  "$pogenom_annot/PGAP/ForPOGENOM/"
# All files below are written to PROKKA/ForPOGENOM, so that directory has to
# exist as well (previously only PGAP/ForPOGENOM was created).
mkdir -p "$pogenom_prep"

#grep "^202-LMAG-1" /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Ldel/202-LMAG/PROKKA_04012020.gff |sed 's/^202-LMAG-1/CP046131/g' > /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped.gff

#grep "^202-SMAG-1" /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/202-SMAG/PROKKA_04012020.gff |sed 's/^202-SMAG-1/CP046134/g' >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/bothSpecies_prepped.gff

# L. delbrueckii first (creates the file), then S. thermophilus (appended).
grep -v "^#" "$pogenom_annot/PROKKA/Ldel/202-LMAG/PROKKA_04012020.gff" \
  | sed 's/^202-LMAG-1/CP046131/g' \
  > "$pogenom_prep/bothSpecies_prepped.gff"

grep -v "^#" "$pogenom_annot/PROKKA/Sterm/202-SMAG/PROKKA_04012020.gff" \
  | sed 's/^202-SMAG-1/CP046134/g' \
  >> "$pogenom_prep/bothSpecies_prepped.gff"

# Keep only CDS rows (GFF column 3); lines are passed through unchanged.
awk -F "\t" '$3=="CDS"' "$pogenom_prep/bothSpecies_prepped.gff" > "$pogenom_prep/bothSpecies_prepped_ready.gff"

#GFF_file=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/MAG_annotation_onlyBAC_mod.gff 


###--------------------------------
#RMK202
#Konserve_202
#Versand_202
#Lyo_202_2012
#lyo202_96
#Lyo_202_2014


#vcftools --vcf /archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf --minQ 30 --remove-indels --recode --recode-INFO-all \
#  --keep /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/SamplesMeta.txt --out \
#  /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta

# Replace carriage returns by newlines in the manually modified annotation.
# The cleaned file is only referenced by the commented-out GFF_file below.
sed -e 's/\r/\n/g' "$pogenom_prep/MAG_annotation_onlyBAC_mod.gff"  >  "$pogenom_prep/MAG_annotation_onlyBAC_mod_cleaned.gff"

#GFF_file=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/ForPOGENOM/MAG_annotation_onlyBAC_mod_cleaned.gff
GFF_file=$pogenom_prep/bothSpecies_prepped_ready.gff

# From here on VCFFILE points to the VCF restricted to the metagenome samples
# (the file the commented-out vcftools call above would produce).
VCFFILE=$pogenom_prep/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta.recode.vcf

# Sanity check: list the sequence IDs (with counts) in the GFF and the VCF and
# the FASTA headers of the assembly so they can be compared by eye.
grep -v "^#" "${GFF_file}" |cut -f 1|sort|uniq -c
#tail ${GFF_file}
grep ">" "${Assembly}"
grep -v "^#" "${VCFFILE}" |cut -f 1|sort|uniq -c
##========================================
##Run POGENOME
##========================================


###-----------------
##Sterm
###-----------------
# Runs POGENOM for S. thermophilus: rename the PROKKA contig to the accession
# CP046134 in the GFF, subset the VCF to that chromosome and to the selected
# samples, then run pogenom.pl on the result.
pogenom_annot=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation
pogenom_results=$pogenom_annot/POGENOM_PGAP/bothGEnomes_allSamples/sterm
pogenom_vcf_prefix=$pogenom_annot/PGAP/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta_onlySterm

# The renamed GFF is written here, so make sure the directory exists.
mkdir -p "$pogenom_annot/PROKKA/ForPOGENOM"

# First expression renames the ID at the start of a line; the second replaces
# every ">202-SMAG-1" (including the ">") by the bare accession.
sed -e 's/^202-SMAG-1/CP046134/g' -e 's/>202-SMAG-1/CP046134/g' \
    "$pogenom_annot/PROKKA/Sterm/S_I_202_SMAG/PROKKA_08282020.gff" \
  > "$pogenom_annot/PROKKA/ForPOGENOM/bothSpecies_prepped_sterm_cleaned.gff"
GFF_file_03=$pogenom_annot/PROKKA/ForPOGENOM/bothSpecies_prepped_sterm_cleaned.gff

#genomes=S_M_202_SMAG
#echo "##gff-version 3" > /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#echo "##sequence-region 202-SMAG-1 1 1865439" >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff

#grep "^CP046134" /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}.gff |awk -F "\t" '{OFS="\t"}{if($3=="gene")print $0}' |sed 's/ID=.*locus_tag=/ID=/g' |sed 's/gene/CDS/g'|sed 's/;.*$//g'| awk '{print $0"_gene"}' >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#GFF_file_03=/archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#echo "##FASTA" >> ${GFF_file_03}
#cat /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/S_I_202_SMAG/PROKKA_08282020.fna |sed 's/>202-SMAG-1/CP046134/g' >> ${GFF_file_03}
#samtools faidx /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}.fasta CP046134 > /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}_forPOGENOM.fasta


mkdir -p "$pogenom_annot/PGAP/ForPOGENOM/"

# Keep chromosome CP046134 only, sites with quality >= 30, no indels, and the
# samples listed in the --keep file.
vcftools --vcf /archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf \
  --chr CP046134 --minQ 30 --remove-indels --recode --recode-INFO-all \
  --keep /archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt \
  --out "$pogenom_vcf_prefix"

VCFFILE=$pogenom_vcf_prefix.recode.vcf

# Start from an empty result directory. -f keeps the first run (directory not
# yet present) from failing.
rm -rf "$pogenom_results/"
mkdir -p "$pogenom_results/"

perl /home/vincent/apps/POGENOM/POGENOM-0.8.1/pogenom.pl \
  --vcf_file "${VCFFILE}" \
  --out "$pogenom_results/RunPogenome_all_rmk202_new" \
  --gff_file "${GFF_file_03}" \
  --genetic_code_file /home/vincent/apps/POGENOM/POGENOM-0.8.1/bacterial_genetic_code_table11_ncbi.txt

#--fasta_file /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}_forPOGENOM.fasta 
#--genome_size 1865459
###-----------------
##Ldel
###-----------------
# Runs POGENOM for L. delbrueckii: rename the PROKKA contig to the accession
# CP046131 in the GFF, subset the VCF to that chromosome and to the selected
# samples, then run pogenom.pl on the result.
pogenom_annot=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation
pogenom_results=$pogenom_annot/POGENOM_PGAP/bothGEnomes_allSamples/Ldel
pogenom_vcf_prefix=$pogenom_annot/PGAP/ForPOGENOM/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_onlyMeta_onlyLdel

# The renamed GFF is written here, so make sure the directory exists.
mkdir -p "$pogenom_annot/PROKKA/ForPOGENOM"

# First expression renames the ID at the start of a line; the second replaces
# every ">202-LMAG-1" (including the ">") by the bare accession.
sed -e 's/^202-LMAG-1/CP046131/g' -e 's/>202-LMAG-1/CP046131/g' \
    "$pogenom_annot/PROKKA/Ldel/L_I_202_LMAG/PROKKA_10022020.gff" \
  > "$pogenom_annot/PROKKA/ForPOGENOM/bothSpecies_prepped_ldel_cleaned.gff"
GFF_file_03=$pogenom_annot/PROKKA/ForPOGENOM/bothSpecies_prepped_ldel_cleaned.gff


#genomes=L_M_202_LMAG



#grep "^CP046131" /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}.gff |awk -F "\t" '{OFS="\t"}{if($3=="gene")print $0}' |sed 's/ID=.*locus_tag=/ID=/g' |sed 's/gene/CDS/g'|sed 's/;.*$//g'| awk '{print $0"_gene"}' > /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#GFF_file_03=/archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/gff/${genomes}_forPOGENOM.gff
#echo "##FASTA" >> ${GFF_file_03}
#samtools faidx /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes//FINAL_mix_typestrains_RMK202/fasta/${genomes}.fasta CP046131 |sed 's/>//g' >> ${GFF_file_03}


mkdir -p "$pogenom_annot/PGAP/ForPOGENOM/"


# Keep chromosome CP046131 only, sites with quality >= 30, no indels, and the
# samples listed in the --keep file.
vcftools --vcf /archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf \
  --chr CP046131 --minQ 30 --remove-indels --recode --recode-INFO-all \
  --keep /archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt \
  --out "$pogenom_vcf_prefix"

VCFFILE=$pogenom_vcf_prefix.recode.vcf

# Start from an empty result directory. -f keeps the first run (directory not
# yet present) from failing.
rm -rf "$pogenom_results/"
mkdir -p "$pogenom_results/"

perl /home/vincent/apps/POGENOM/POGENOM-0.8.1/pogenom.pl \
  --vcf_file "${VCFFILE}" \
  --out "$pogenom_results/RunPogenome_all_rmk202_new" \
  --gff_file "${GFF_file_03}" \
  --genetic_code_file /home/vincent/apps/POGENOM/POGENOM-0.8.1/bacterial_genetic_code_table11_ncbi.txt

#--genome_size 2166765
library(readr)
library(ggplot2)
library(lubridate)
library(tidyverse)
# ---- Interaction genes and POGENOM per-gene pN/pS tables ----
# Reads the curated list of interaction/functioning genes and the POGENOM
# per-gene pN/pS output for both species, joins them on the gene identifier
# and builds `GENES_long`: one row per gene of interest with its combined
# pN/pS value, species, number of loci and a functional class.
non_genomic_dir <- "../data_zenodo/non_genomic_data"

Genes_of_Interest <- read_delim(
  file.path(non_genomic_dir, "all_interaction_functioning_Genes.txt"),
  delim = "\t", escape_double = FALSE,
  col_names = c("species", "gene", "OG", "numberGenomes"),
  trim_ws = TRUE
) %>%
  select(-numberGenomes)

# Orthogroup (OG) to gene mapping per species.
OGs_Ldel <- read_delim(
  file.path(non_genomic_dir, "OGs_Ldel.txt"),
  delim = "\t", escape_double = FALSE,
  col_names = c("OG", "Gene"), trim_ws = TRUE
)
OGs_Sterm <- read_delim(
  file.path(non_genomic_dir, "OGs_Sterm.txt"),
  delim = "\t", escape_double = FALSE,
  col_names = c("OG", "Gene"), trim_ws = TRUE
)
Genes_of_Interest_Ldel <- Genes_of_Interest %>% filter(species == "Ldel")
Genes_of_Interest_Sterm <- Genes_of_Interest %>% filter(species == "Sterm")

# Full outer joins on the orthogroup.
GENES_Ldel <- merge(OGs_Ldel, Genes_of_Interest_Ldel, by = "OG", all = TRUE)
# table(GENES_Ldel$gene)
GENES_Sterm <- merge(OGs_Sterm, Genes_of_Interest_Sterm, by = "OG", all = TRUE)
# table(GENES_Sterm$gene)

# POGENOM per-gene output; the per-sample pN/pS columns are dropped so that
# only the value over all samples combined remains.
per_sample_pNpS <- c(
  "RMK202 pNpS", "Konserve_202 pNpS", "Versand_202 pNpS",
  "Lyo_202_2012 pNpS", "lyo202_96 pNpS", "Lyo_202_2014 pNpS"
)
RunPogenome_Ldel_pNpS_per_gene <- read_delim(
  file.path(non_genomic_dir, "RunPogenome_Ldel.pNpS-per-gene.txt"),
  delim = "\t", escape_double = FALSE, trim_ws = TRUE
) %>%
  select(-all_of(per_sample_pNpS))
RunPogenome_Sterm_pNpS_per_gene <- read_delim(
  file.path(non_genomic_dir, "RunPogenome_Sterm.pNpS-per-gene.txt"),
  delim = "\t", escape_double = FALSE, trim_ws = TRUE
) %>%
  select(-all_of(per_sample_pNpS))

# Quick look at the genome-wide pN/pS distributions.
ggplot(RunPogenome_Sterm_pNpS_per_gene, aes(x = `All_samples_combined pNpS`)) +
  geom_density()
ggplot(RunPogenome_Ldel_pNpS_per_gene, aes(x = `All_samples_combined pNpS`)) +
  geom_density()

# Cleaned gene list: `geness` is "<species>_<gene>", `Name` is the identifier
# that is matched against the `Gene` column of the POGENOM tables below.
all_interaction_functioning_Genes_cleaned <- read_delim(
  file.path(non_genomic_dir, "all_interaction_functioning_Genes_cleaned.txt"),
  delim = "\t", escape_double = FALSE,
  col_names = c("geness", "Name"), trim_ws = TRUE
)

# Split once at the first underscore: part 1 = species, part 2 = gene.
gene_id_parts <- str_split_fixed(
  all_interaction_functioning_Genes_cleaned$geness, fixed("_"), 2
)
all_interaction_functioning_Genes_cleaned$gene <- gene_id_parts[, 2]
all_interaction_functioning_Genes_cleaned$species <- gene_id_parts[, 1]

all_interaction_functioning_Genes_cleaned <-
  all_interaction_functioning_Genes_cleaned %>%
  select(-geness)

# Left joins: every POGENOM gene is kept; `gene`/`species` stay NA for genes
# that are not in the list of interest.
GENES_Sterm_extended <- merge(
  RunPogenome_Sterm_pNpS_per_gene, all_interaction_functioning_Genes_cleaned,
  by.x = "Gene", by.y = "Name", all.x = TRUE
)
GENES_Ldel_extended <- merge(
  RunPogenome_Ldel_pNpS_per_gene, all_interaction_functioning_Genes_cleaned,
  by.x = "Gene", by.y = "Name", all.x = TRUE
)

# Overview of the matched genes. This call used to sit above the merge, where
# `GENES_Ldel_extended` did not exist yet and stopped a fresh session.
table(GENES_Ldel_extended$gene)

# GENES_Ldel_extended <- merge(GENES_Ldel,RunPogenome_Ldel_pNpS_per_gene,by="Gene",all = TRUE)
# table(GENES_Ldel$gene)
# GENES_Sterm_extended  <- merge(GENES_Sterm,RunPogenome_Sterm_pNpS_per_gene,by="Gene",all = TRUE)
# table(GENES_Sterm$gene)

# Genes of interest that POGENOM reported on.
GENES_Sterm_reduced <- GENES_Sterm_extended %>%
  filter(!is.na(gene), !is.na(Num_loci))
GENES_Ldel_reduced <- GENES_Ldel_extended %>%
  filter(!is.na(gene), !is.na(Num_loci))

# A missing combined pN/pS value is plotted as 0.
GENES_Ldel_reduced$`All_samples_combined pNpS`[
  is.na(GENES_Ldel_reduced$`All_samples_combined pNpS`)
] <- 0
GENES_Sterm_reduced$`All_samples_combined pNpS`[
  is.na(GENES_Sterm_reduced$`All_samples_combined pNpS`)
] <- 0

GENES_Sterm_reduced_prep <- GENES_Sterm_reduced %>%
  select(c(gene, `All_samples_combined pNpS`, species, Num_loci))
GENES_ldel_reduced_prep <- GENES_Ldel_reduced %>%
  select(c(gene, `All_samples_combined pNpS`, species, Num_loci))

GENES_long <- rbind(GENES_Sterm_reduced_prep, GENES_ldel_reduced_prep) # %>% add_column(functioning="protocooperation\nrelated")

# Genes whose name contains "pep" are classed as peptidases, everything else
# as protocooperation related.
GENES_long$functioning <- ifelse(
  grepl("pep", GENES_long$gene), "peptidase", "protocooperation\nrelated"
)

# Sort by functional class (by column name rather than position 5).
GENES_long <- GENES_long[order(GENES_long$functioning), ]



# Dot "heatmap" of the combined pN/pS value per gene of interest (rows) and
# species (columns). Each gene is drawn as a filled circle coloured by pN/pS
# with a black outline on top. Values outside [0, 2] fall outside the scale
# limits and are drawn in the `na.value` colour (white).
ggheatmap_CRISPR <- ggplot(
  GENES_long,
  aes(x = species, y = gene, color = `All_samples_combined pNpS`)
) +
  geom_point(size = 8, shape = "circle") +
  # shape 1 = open circle, used as outline
  geom_point(shape = 1, size = 8, colour = "black") +
  scale_color_gradient2(
    low = "red", high = "grey",
    midpoint = 1, limits = c(0, 2), space = "Lab",
    name = "pN/pS", na.value = "white"
  ) +
  # The legend title is set via `name` above; only the axis titles are blanked.
  labs(x = "", y = "") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  ) +
  coord_fixed()
# Print the heatmap
print(ggheatmap_CRISPR)


##===================================================
##plot
##===================================================

# Export as SVG; the explicit print() is what draws into the open device.
svg("../03_results//HEATMAP_interactionGenes_pN_pS.svg", width = 3, height = 9)
print(ggheatmap_CRISPR)
dev.off()
 
##===================================================
##distribution
##===================================================

# Compares pN/pS of the genes of interest with all remaining genes: the
# "reverse" tables hold the POGENOM genes that did NOT match the list of
# interest (`gene` is NA) and are labelled "other function".
GENES_Sterm_reverse <- GENES_Sterm_extended %>%
  filter(is.na(gene), !is.na(Num_loci))
GENES_Ldel_reverse <- GENES_Ldel_extended %>%
  filter(is.na(gene), !is.na(Num_loci))

# A missing combined pN/pS value is plotted as 0.
GENES_Ldel_reverse$`All_samples_combined pNpS`[
  is.na(GENES_Ldel_reverse$`All_samples_combined pNpS`)
] <- 0
GENES_Sterm_reverse$`All_samples_combined pNpS`[
  is.na(GENES_Sterm_reverse$`All_samples_combined pNpS`)
] <- 0

GENES_Sterm_reverse_prep <- GENES_Sterm_reverse %>%
  select(c(gene, `All_samples_combined pNpS`, species, Num_loci))
GENES_ldel_reverse_prep <- GENES_Ldel_reverse %>%
  select(c(gene, `All_samples_combined pNpS`, species, Num_loci))

GENES_long_reverse <- rbind(GENES_Sterm_reverse_prep, GENES_ldel_reverse_prep) %>%
  add_column(functioning = "other function")

# Background genes first, genes of interest last.
GENES_complete <- rbind(GENES_long_reverse, GENES_long)

# Exploratory views per functional class.
ggplot(GENES_complete, aes(x = `All_samples_combined pNpS`)) +
  geom_density() +
  facet_wrap(~functioning)

ggplot(GENES_complete, aes(x = `All_samples_combined pNpS`, y = Num_loci)) +
  geom_point() +
  facet_wrap(~functioning)

# Fix the legend/colour order of the functional classes.
GENES_complete$functioning <- factor(
  GENES_complete$functioning,
  levels = c("protocooperation\nrelated", "peptidase", "other function")
)

# One colour per factor level, in the order given above.
colorssss <- c("red", "orange", "grey88")
dnDSplot <- ggplot(
  GENES_complete,
  aes(
    x = `All_samples_combined pNpS`, y = Num_loci,
    fill = functioning, color = functioning
  )
) +
  geom_point(size = 2) +
  theme_classic() +
  labs(x = "pN/pS", y = "Number of mutations") +
  scale_fill_manual(values = colorssss) +
  scale_color_manual(values = colorssss) +
  theme(legend.title = element_blank())
dnDSplot

# Export as SVG. print() is called explicitly so the file is also drawn when
# the script is run non-interactively (e.g. via source()), where a bare
# object name is not auto-printed.
svg("../03_results/dot_plot_pN_pS.svg", width = 4.5, height = 4)
print(dnDSplot)
dev.off()

1.2 Figure 2

Figure 2. Metagenomic sampling design and species abundance. A) The starter culture propagation scheme as applied in the cheese starter culture production. The samples subjected to metagenomic sequencing are indicated by darker colors and labelled with numbers. Every propagation cycle includes a freeze drying, reactivation, and working stock step. From the working stock, commercial starter cultures for weekly shipments to cheesemakers are produced. The propagation experiment was carried out in the same way as in production and in five replicates corresponding to samples 7-11. The numbers between the working stock (x) indicate the number of cycles in between. B) Relative abundance of the two bacterial species in the eleven starter culture samples (1-6, historical samples; 7-11, replicates of the propagation experiment). C) Bacterial counts throughout the propagation experiment for both species and the five replicates (lines are colored according to species and points according to samples within the propagation cycle Fig. 2A). D) Acidification potential throughout the propagation experiment, as measured by pH reached after 18h incubation at 37°C in sterile milk.

Figure 2. Metagenomic sampling design and species abundance. A) The starter culture propagation scheme as applied in the cheese starter culture production. The samples subjected to metagenomic sequencing are indicated by darker colors and labelled with numbers. Every propagation cycle includes a freeze drying, reactivation, and working stock step. From the working stock, commercial starter cultures for weekly shipments to cheesemakers are produced. The propagation experiment was carried out in the same way as in production and in five replicates corresponding to samples 7-11. The numbers between the working stock (x) indicate the number of cycles in between. B) Relative abundance of the two bacterial species in the eleven starter culture samples (1-6, historical samples; 7-11, replicates of the propagation experiment). C) Bacterial counts throughout the propagation experiment for both species and the five replicates (lines are colored according to species and points according to samples within the propagation cycle Fig. 2A). D) Acidification potential throughout the propagation experiment, as measured by pH reached after 18h incubation at 37°C in sterile milk.

1.2.1 taxon plot

Do this by looking at the coverage of S.thermophilus and L.delbrueckii in the different metagenomic samples.


###================
##description file
###================
## Shell variables for the abundance workflow of this section.
## Of these, only samplesss and BaseLocation are referenced by the commands
## between here and the end of the coverage loop.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples.txt  ##the file with all sample names (superseded by the second samplesss assignment below)
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
## directory holding one sub-folder per sample with the mapped reads
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt  ##the file with all sample names; this later assignment is the one in effect

###================
##collect the gene intervals of phages and both bacteria in one BED file
###================
mkdir -p  /archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/prep/

# The three extractions run in order inside one command group, so the output
# file is created (and truncated) once and then filled sequentially.
{
  # 1) annotation file under phage_annotation/: every record whose line does
  #    NOT start with "CP"; its first three columns are copied as they are
  awk -F "\t" 'BEGIN{OFS="\t"} !/^CP/ {print $1,$2,$3}' \
    /archiv/Projects/2019_RMK202_analysis/phage_annotation/phaster/ZZ_be76e5e924.PHASTER/gene_CLEANED_FINAL_ALL.gff

  # 2) S. thermophilus PGAP annotation: CDS features on "CP..." sequences,
  #    written as sequence / start / end (GFF columns 1, 4, 5).
  #    Lines starting with "CP" can never be "#" comment lines.
  awk -F "\t" 'BEGIN{OFS="\t"} /^CP/ && $3=="CDS" {print $1,$4,$5}' \
    /archiv/Projects/2019_Pilotplan/11_PGAP/final/sterm/S_thermophilus_RMK202.current.gb.gff

  # 3) L. delbrueckii PGAP annotation: same selection as above
  awk -F "\t" 'BEGIN{OFS="\t"} /^CP/ && $3=="CDS" {print $1,$4,$5}' \
    /archiv/Projects/2019_Pilotplan/11_PGAP/final/ldel/L_delbrueckii_RMK202.current.gb.gff
} > /archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/prep/genes_for_Abundances.bed



###================
##mapping
###================

##============
##per-gene read coverage for every sample listed in ${samplesss}
##============
# Inputs : ${samplesss}     one sample name per whitespace-separated word
#          ${BaseLocation}  per-sample folders containing
#                           bwaMapping2DB/<sample>_mapping2ref.bam
# Outputs: mapping/<sample>2bacteriaDB.bed               bedtools coverage per sample
#          mapping/all_2bacteria_and_phages_from_MAG.bed all samples stacked,
#                                                        sample name as last column
names=RMK202
# -f: on a first run the directory does not exist yet; that is not an error
rm -rf /archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/
mkdir -p /archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/

# Start the progress counter explicitly. Without this the first iteration
# printed an empty counter, or continued from a value left over in the shell.
num=1

for names in $(cat "${samplesss}")
  do
  # NOTE(review): the "/16" total is hard-coded and is not derived from ${samplesss}
  echo ${num}"/16  :" ${names}
  num=$((num+1))

  bedtools coverage -bed -a /archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/prep/genes_for_Abundances.bed -b "${BaseLocation}/${names}/bwaMapping2DB/${names}_mapping2ref.bam"  > \
    "/archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/${names}2bacteriaDB.bed"

  #${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam

  # append this sample's table to the combined file, tagging every row with the sample name
  awk -F "[\t]" -v namess="$names"  'BEGIN{OFS="\t"}{print $0,namess}' \
    "/archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/${names}2bacteriaDB.bed" \
    >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/all_2bacteria_and_phages_from_MAG.bed

   done
   
##===================================
#-------------file import

# Combined per-gene coverage table (one row per gene and sample). The file has
# no header, so the column names are supplied here.
read_count <- read_delim(
  "../data_zenodo/non_genomic_data/Coverage_bacteria_and_phages_from_MAG.bed",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "chr", "start", "end", "count",
    "length_mapped", "geneLength", "unknown", "sample"
  ),
  trim_ws = TRUE
)

# Alternative input (author's local copy of the same table):
# read_count <- read_delim("/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/07_abundances/mapping/all_2bacteria_and_phages_from_MAG.bed","\t", escape_double = FALSE, col_names = c("chr","start","end","count","length_mapped","geneLength","unknown","sample"),trim_ws = TRUE)

# number of rows per sequence (printed twice, as in the original workflow)
table(read_count$chr)

table(read_count$chr)

# Read count scaled by 600 and normalised by gene length.
# NOTE(review): the meaning of the constant 600 is not documented here.
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength

library(dplyr)

# Median gene coverage per sample and sequence, restricted to the two
# chromosomes and three phages that are plotted.
all_final <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  filter(
    chr %in% c(
      "CP046131", "CP046134",
      "Lactobacillus_phage_1", "Streptococcus_phage_1", "Streptococcus_phage_2"
    )
  )

# Sum of the medians per sample = denominator of the relative abundance.
total_samples_sumTreatment <- aggregate(
  median ~ sample,
  data = all_final[, c("sample", "median")],
  FUN = sum,
  na.rm = TRUE
)

all_final$total_coverage <-
  total_samples_sumTreatment$median[
    match(all_final$sample, total_samples_sumTreatment$sample)
  ]
all_final$percent_coverage <- 100 * (all_final$median / all_final$total_coverage)

table(all_final$sample)

# Drop the two samples that are not shown and fix the plotting order of the rest.
all_final <- all_final %>%
  filter(!sample %in% c("th_K2_8h", "di_K2_6h"))
all_final$sample <- factor(
  all_final$sample,
  levels = c(
    "lyo202_96", "Lyo_202_2012", "Konserve_202", "Lyo_202_2014", "RMK202",
    "Versand_202", "G1_6_18", "G2_6_18", "G3_6_18", "G4_6_18", "G5_6_18"
  )
)

# table(total_samples$phage) %>% length()

# The same table is written to both result files (no header row).
invisible(lapply(
  c("../03_results/coverage_rmk202.tsv", "../03_results/coverage_rmk202_n32.tsv"),
  function(path) {
    write.table(all_final, path, sep = "\t", quote = FALSE, col.names = FALSE)
  }
))

 # Seven-colour palette kept from an earlier version of the figure.
all_colours <- c(
  "#C5F6FA", "#6CF5A3", "#36E37B", "#10B552",
  "darkorange", "#FAA0A0", "#EB4D4D"
)

##----------------change name
library(plyr)
library(dplyr)

# Replace the internal sample identifiers by the labels shown on the x-axis.
all_final$sample <- revalue(
  all_final$sample,
  c(
    "lyo202_96"    = "Lyo 1996",
    "Lyo_202_2012" = "Lyo\n2012",
    "Lyo_202_2014" = "Lyo\n2014",
    "Konserve_202" = "working\nstock",
    "RMK202"       = "starter\nculture\n2012",
    "Versand_202"  = "starter\nculture\n2018"
  )
)

##----------------plot
# all_final <- all_final %>% filter(!sample %in% c("cheesemaking\nday1","cheesemaking\nday2"))

levels(all_final$chr)

# Fix the order of the sequences (this is the order in which fill colours
# are assigned and bars are stacked).
all_final$chr <- factor(
  all_final$chr,
  levels = c(
    "Lactobacillus_phage_2", "Lactobacillus_phage_1", "CP046133", "CP046132",
    "Streptococcus_phage_2", "Streptococcus_phage_1", "CP046135",
    "CP046131", "CP046134"
  )
)

# Palettes tried before (both nine colours), superseded by the one below:
#   c("#36E37A","#C5F6FA","#6CF5A3","#36E37B","orange","darkorange","#FAA0A0","#10B552","#EB4D4D")
#   c("#dbece1","#a0cbd2","#6bf5a2","#66c264","#ffa300","#ff8a00","#ff5200","#10B552","#EB4D4D")
# Palette in effect: five colours.
all_colours_new <- c("#a0cbd2", "#ffa300", "#ff8a00", "#10B552", "#EB4D4D")

##===============================
##only Bacteria
##===============================

# Coverage of the two bacterial chromosomes only. The object was used here
# without being created anywhere in this part of the workflow, so it is derived
# from `all_final` unless it already exists in the workspace.
# CP046134 / CP046131 are the accessions that the snpEff section of this file
# assigns to the S. thermophilus / L. delbrueckii contigs.
if (!exists("all_final_bacteria")) {
  all_final_bacteria <- all_final %>%
    filter(chr %in% c("CP046131", "CP046134"))
}
all_final_bacteria

# Relative abundance recomputed with the bacteria-only total as denominator.
total_samples_sumTreatment <- aggregate(
  . ~ sample,
  data = all_final_bacteria[, c("sample", "median")],
  sum,
  na.rm = TRUE
)

all_final_bacteria$total_coverage <- total_samples_sumTreatment[
  match(all_final_bacteria$sample, total_samples_sumTreatment$sample),
  "median"
]
all_final_bacteria$percent_coverage <-
  100 * (all_final_bacteria$median / all_final_bacteria$total_coverage)

# Two colours for the two chromosomes (overwrites the five-colour palette).
all_colours_new <- c("#10B552", "#EB4D4D")

# Stacked bar chart: one bar per sample, filled by chromosome.
PrelAbundance_bacteria <- ggplot(
  data = all_final_bacteria,
  aes(y = percent_coverage, x = sample, group = interaction(chr), fill = chr)
) +
  geom_bar(stat = "identity") +
  labs("",
       x = "",
       y = "relative abundance") +
  theme_classic() +
  scale_fill_manual(values = all_colours_new) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "right",
    legend.title = element_blank()
  )

library(patchwork)

PrelAbundance_bacteria

# NOTE(review): `PrelAbundance` (the plot including the phages) is not created
# in this part of the workflow; it has to exist in the workspace.
svg("../03_results/relative_abundance_all.svg", width = 10, height = 4.5)
# print() explicitly: auto-printing does not happen when the script is run
# through source() or inside a function, which would leave the SVG empty.
print(
  (PrelAbundance + theme(legend.position = "none")) +
    (PrelAbundance_bacteria + theme(legend.position = "none")) +
    PrelAbundance
)
dev.off()

1.2.2 propagation experiment

In this project we have created a propagation experiment. Here we analyse the phenotypic data.

# Phenotypic data of the propagation experiment. The first two lines of the
# file are skipped, rows that repeat the header (sample == "sample") are
# removed, and only the columns used below are kept. The sample identifier is
# then split at "_" into its three parts: sampleNAME, week and passage.
rmk202_test_cellCounts <- read_csv(
  "../data_zenodo/non_genomic_data/rmk202_propagation_experiment_data.csv",
  skip = 2
) %>%
  filter(sample != "sample") %>%
  select(c(
    "sample", "step", "pH", "average BM", "average SR.9.3", "calculated Ldel",
    "generations_all", "generations_sterm", "generations_ldel",
    "cummulative_generation_all", "cummulative_generation_sterm",
    "cummulative_generation_ldel",
    "doublingTime_all", "doublingTime_sterm", "doublingTime_ldel",
    "survival_rate_all", "survival_rate_sterm", "survival_rate_ldel"
  )) %>%
  mutate(
    sampleNAME = str_split_fixed(sample, fixed("_"), 3)[, 1],
    week       = str_split_fixed(sample, fixed("_"), 3)[, 2],
    passage    = str_split_fixed(sample, fixed("_"), 3)[, 3]
  )

###------------------------cumulative generation per species at end
# Rows with passage 18, reshaped to long format: one row per sample and
# species, NA values dropped. `feature` becomes a factor in column order.
cumGenData <- rmk202_test_cellCounts %>%
  filter(passage == 18) %>%
  select(sample, cummulative_generation_sterm, cummulative_generation_ldel) %>%
  gather(
    ., feature, generations,
    c("cummulative_generation_sterm", "cummulative_generation_ldel"),
    factor_key = TRUE, na.rm = TRUE
  )

cumGenData$feature <- plyr::revalue(
  cumGenData$feature,
  c(
    "cummulative_generation_sterm" = "S.thermophilus",
    "cummulative_generation_ldel" = "L. delbrueckii"
  )
)

# Fill colours in factor-level order: S. thermophilus first, L. delbrueckii second.
colorsSpecies <- c("#EB4D4D", "#10B552")

cumGenPLOT <- ggplot(
  cumGenData,
  aes(x = feature, group = feature, y = generations, fill = feature)
) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "number of generations\n at the end", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
cumGenPLOT


###------------------------number of generations per species 
# Generations per propagation step, long format (one row per sample and
# species), excluding passage 1. The table was previously built twice with
# identical code; it is now built once and reused for both plots.
GenData <- rmk202_test_cellCounts %>%
  filter(passage != 1) %>%
  select(sample, generations_sterm, generations_ldel) %>%
  gather(
    ., feature, generations,
    c("generations_sterm", "generations_ldel"),
    factor_key = TRUE, na.rm = TRUE
  )

GenData$feature <- plyr::revalue(
  GenData$feature,
  c("generations_sterm" = "S.thermophilus", "generations_ldel" = "L. delbrueckii")
)

# First version of the plot (printed only; GenPLOT is reassigned below).
GenPLOT <- ggplot(
  GenData,
  aes(x = feature, group = feature, y = generations, fill = feature)
) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "number of generations\n per propagation step", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
GenPLOT

# overall mean across both species
mean(GenData$generations)

# Version that is used in the assembled figure.
# NOTE(review): the y label "Relative abundance step" looks like a
# search-and-replace accident for "per propagation step" - confirm.
GenPLOT <- ggplot(
  GenData,
  aes(x = feature, group = feature, y = generations, fill = feature)
) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "Number of generations\nRelative abundance step", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
GenPLOT

# Mean per species. The column used to be called `median` although it holds
# the mean; it is now named for what it contains.
GenData %>%
  group_by(feature) %>%
  dplyr::summarize(mean = mean(generations))
  
 ###------------------------average CFU after second passage
colnames(rmk202_test_cellCounts)

# Counts of both species at the "second_passage" step, long format.
# The value column keeps the name `generations` although it holds counts here.
CountData <- rmk202_test_cellCounts %>%
  filter(step == "second_passage") %>%
  select("sample", "average SR.9.3", "calculated Ldel") %>%
  gather(
    ., feature, generations,
    c("average SR.9.3", "calculated Ldel"),
    factor_key = TRUE, na.rm = TRUE
  )

CountData$feature <- plyr::revalue(
  CountData$feature,
  c("average SR.9.3" = "S.thermophilus", "calculated Ldel" = "L. delbrueckii")
)

CountPLOT <- ggplot(
  CountData,
  aes(x = feature, group = feature, y = generations, fill = feature)
) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "Bacterial count of\nworking stock [CFU/ml]", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = colorsSpecies)
CountPLOT

# Per-species summaries. The first column was labelled `median` although it
# is computed with mean(); it is now labelled `mean`.
CountData %>%
  group_by(feature) %>%
  dplyr::summarize(mean = mean(generations))

CountData %>%
  group_by(feature) %>%
  dplyr::summarize(mean = mean(generations), sd = sd(generations))
  
 ###------------------------death rate after freeze drying
rmk202_test_cellCounts$step

# Overall survival rate at the "freeze_dry" step; the constant `species`
# column only serves as a single x/fill category for the box plot.
SurvivalData <- rmk202_test_cellCounts %>%
  filter(step == "freeze_dry") %>%
  select("sample", "survival_rate_all") %>%
  add_column(species = "all")

# CountData$feature  <- plyr::revalue(CountData$feature, c("average SR.9.3"="S.thermophilus","calculated Ldel"="L. delbrueckii"))

# Axis label spelling corrected ("Survivial" -> "Survival").
SurvivalPLOT <- ggplot(
  SurvivalData,
  aes(x = species, group = species, y = survival_rate_all, fill = species)
) +
  geom_boxplot() +
  theme_classic() +
  labs(y = "Survival rate\n [%]", x = "") +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  scale_fill_manual(values = "grey88")
SurvivalPLOT

# Mean survival rate; the column was labelled `median` although it is a mean.
SurvivalData %>%
  group_by(species) %>%
  dplyr::summarize(mean = mean(survival_rate_all))
  
 
 ###------------------------CFU over time
rmk202_test_cellCounts$cummulative_generation_all

# Counts of both species along the experiment, long format. Passage "17" and
# the freeze-dry step are left out. The value column is called `generations`
# but holds the counts of the two gathered columns.
CFUData <- rmk202_test_cellCounts %>%
  filter(passage != "17", step != "freeze_dry") %>%
  select(
    "sample", "step", "cummulative_generation_all", "sampleNAME", "passage",
    "average SR.9.3", "calculated Ldel"
  ) %>%
  add_column(species = "all") %>%
  gather(
    ., feature, generations,
    c("average SR.9.3", "calculated Ldel"),
    factor_key = TRUE, na.rm = TRUE
  )

# Rows without a cumulative generation count are placed at x = 0.
# CFUData[which(is.na(CFUData$cummulative_generation_all)),]
CFUData[which(is.na(CFUData$cummulative_generation_all)), "cummulative_generation_all"] <- 0

CFUData$feature <- plyr::revalue(
  CFUData$feature,
  c("average SR.9.3" = "S.thermophilus", "calculated Ldel" = "L. delbrueckii")
)
CFUData$passage <- as.double(CFUData$passage)
CFUData$sampleNAME <- as.factor(CFUData$sampleNAME)

# One panel per species; one line per sampleNAME; points coloured by step,
# lines coloured by species (both share the single manual colour scale).
CFUPLOT <- ggplot(
  CFUData,
  aes(x = cummulative_generation_all, group = sampleNAME, y = generations)
) +
  facet_wrap(~feature, scales = "free_y", ncol = 1) +
  geom_point(aes(color = step), size = 2) +
  geom_line(aes(color = feature), alpha = 0.4, size = 1) +
  theme_classic() +
  labs(y = "Bacterial count\n[CFU/ml]", x = "generations") +
  theme(
    axis.text.x = element_text(size = 9),
    legend.position = "top",
    legend.title = element_blank()
  ) +
  scale_fill_manual(values = colorsSpecies) +
  scale_color_manual(
    values = c(
      "start" = "black",
      "first_passage" = "#d9d9d9",
      "second_passage" = "#b3b3ff",
      "freeze_dry" = "#ff0101",
      "S.thermophilus" = "#EB4D4D",
      "L. delbrueckii" = "#10B552"
    )
  )
CFUPLOT


  ###------------------------pH over time

# pH values along the experiment (freeze-dry step excluded); rows without a
# cumulative generation count are placed at x = 0.
pHData <- rmk202_test_cellCounts %>%
  filter(step != "freeze_dry") %>%
  select("sample", "step", "sampleNAME", "cummulative_generation_all", "pH")
pHData[which(is.na(pHData$cummulative_generation_all)), "cummulative_generation_all"] <- 0

# pHData$passage <- as.double(pHData$passage)
pHData$sampleNAME <- as.factor(pHData$sampleNAME)

# One grey line per sampleNAME, points coloured by step.
pHPLOT <- ggplot(
  pHData,
  aes(x = cummulative_generation_all, group = sampleNAME, y = pH)
) +
  geom_point(aes(color = step), size = 2) +
  geom_line(color = "grey", alpha = 0.4) +
  theme_classic() +
  labs(y = "pH", x = "generations") +
  theme(axis.text.x = element_text(size = 9), legend.position = "none") +
  scale_color_manual(
    values = c(
      "start" = "black",
      "first_passage" = "#d9d9d9",
      "second_passage" = "#b3b3ff",
      "freeze_dry" = "#ff0101",
      "S.thermophilus" = "#EB4D4D",
      "L. delbrueckii" = "#10B552"
    )
  )
pHPLOT

# Overall mean and standard deviation of the pH. The first column was
# labelled `median` although it is computed with mean().
pHData %>%
  dplyr::summarize(mean = mean(pH), sd = sd(pH))
  
###------------------------put figures together

# Middle row: the four box plots side by side (survival panel half as wide).
plot1 <- SurvivalPLOT + CountPLOT + GenPLOT + cumGenPLOT +
  plot_layout(ncol = 4, widths = c(0.5, 1, 1, 1))

# Full figure: CFU over time, box plots, pH over time (pH row half as high).
plotFinal <- CFUPLOT + plot1 + pHPLOT +
  plot_layout(nrow = 3, heights = c(1, 1, 0.5))
plotFinal

svg("../03_results/evolutionExperiment_plot.svg", width = 6, height = 8)
# print() explicitly: a bare object name is only drawn by auto-printing, which
# does not happen under source() or inside a function and would leave the
# SVG file empty.
print(plotFinal)
dev.off()

1.3 Figure 3

Figure 3. Strain-level diversity of S. thermophilus in cheese starter cultures. A) Alternative allele frequencies of all S. thermophilus SNVs over the metagenomic samples. Recurring SNVs from different samples are connected with a line. Clustering of lines indicates a large number of SNVs with similar frequencies, suggesting genomic coupling. Sample labels on the x-axis correspond to samples highlighted in Fig. 2A. B) Phylogeny of the isolated S. thermophilus strains based on maximum likelihood analysis on 1788 core genes. The isolates split into four lineages indicated by different color shadings. Strains sequenced with Nanopore are labelled with an asterisk. Values on branches indicate bootstrap values (100 replicates). C) Relative abundance of each of the four sub-lineages of S. thermophilus across the eleven metagenomes, based on the average frequency of lineage-specific SNVs identified on the basis of the isolates in Fig. 3B.

Figure 3. Strain-level diversity of S. thermophilus in cheese starter cultures. A) Alternative allele frequencies of all S. thermophilus SNVs over the metagenomic samples. Recurring SNVs from different samples are connected with a line. Clustering of lines indicates a large number of SNVs with similar frequencies, suggesting genomic coupling. Sample labels on the x-axis correspond to samples highlighted in Fig. 2A. B) Phylogeny of the isolated S. thermophilus strains based on maximum likelihood analysis on 1788 core genes. The isolates split into four lineages indicated by different color shadings. Strains sequenced with Nanopore are labelled with an asterisk. Values on branches indicate bootstrap values (100 replicates). C) Relative abundance of each of the four sub-lineages of S. thermophilus across the eleven metagenomes, based on the average frequency of lineage-specific SNVs identified on the basis of the isolates in Fig. 3B.

1.3.1 SNV calling and phasing

This is a very extensive analysis. It consists of:

  • mapping raw reads with bwa mem
  • calling and filtering SNVs with freebayes and vcftools
  • plotting the alternative allele frequencies with R
###================
##description file
###================
## Shell variables for the read-mapping workflow of this section.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt  ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
## per-sample output folders are created below this directory
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final
## reference the reads are mapped against (indexed with bwa below)
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly_phagesOrientated.fasta

##!!!!!!!!!!!!!change names of Forward and Reverse reads
## Single-sample settings. Note that names, r1 and r2 are all reassigned for
## every sample inside the mapping loop below, so these values only matter
## when the commands are run by hand for one sample.
names=G4_6_18
r1=/home/vincent/Projects/2020_StarterEvolution/01_data/20200929_Novogene/X204SC20090774-Z01-F001/trimm_Galore/gz/${names}/${names}-R1_val_1.fq
r2=/home/vincent/Projects/2020_StarterEvolution/01_data/20200929_Novogene/X204SC20090774-Z01-F001/trimm_Galore/gz/${names}/${names}-R2_val_2.fq

###================
##merge genome
#there was a problem in the G4 sample  so I had to change the fastq input
###================

#cp /archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples.txt /archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples_G4_corrected.txt
#/archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/G4_6_18/G4_6_18/G4_6_18-*_kneaddata_paired_1.fastq
#/archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/G4_6_18/G4_6_18/G4_6_18-*_kneaddata_paired_2.fastq
###================
##script
###================
##============
##mapping to reference
##============
# For every sample in the sample table (tab-separated: name, forward reads,
# reverse reads): map with bwa mem, sort, extract the unmapped reads as FASTQ,
# run qualimap bamqc and record the bamqc folder for a later MultiQC run.
bwa index "$Assembly"
# name_folder=02_againstpolished_single_rmk202_final_kneaddata_withStrains 

mkdir -p "${logFilelocation}"

sample_table=/archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples.txt
num=1

for names in $(cut -f 1 "${sample_table}")
#for names in $(ls /archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/ |grep "^G"|head -2)

  do
  # NOTE(review): the "/16" total is hard-coded and is not derived from the table
  echo ${num}"/16  :" ${names}
  num=$((num+1))

  map_dir="${BaseLocation}/${names}/bwaMapping2DB"
  # -f: on a first run the folder does not exist yet; that is not an error
  rm -rf "${map_dir}/"
  mkdir -p "${map_dir}/"

  # Look the read files up by an exact match on column 1. The previous
  # grep "^${names}" was a prefix match and returned several rows whenever one
  # sample name is the beginning of another.
  r1=$(awk -F "\t" -v s="${names}" '$1==s{print $2}' "${sample_table}")
  r2=$(awk -F "\t" -v s="${names}" '$1==s{print $3}' "${sample_table}")

  # bwa mem -t ${threads} /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids/di_rmk202_MAG_reference_polished.fasta
  # ${r1} / ${r2} stay unquoted on purpose so that a wildcard stored in the
  # table is still expanded by the shell, as before.
  bwa mem -t ${threads} "${Assembly}" \
    ${r1} ${r2} | samtools sort -@${threads} -O BAM -o "${map_dir}/${names}_rawIllumina_bwamem_sorted.bam" -

  # reads flagged as unmapped (-f 4) -> FASTQ; the intermediate BAM is removed
  samtools view -b -f 4 "${map_dir}/${names}_rawIllumina_bwamem_sorted.bam" > "${map_dir}/${names}_rawIllumina_bwamem_sorted_unmapped.bam"

  bedtools bamtofastq -i "${map_dir}/${names}_rawIllumina_bwamem_sorted_unmapped.bam" -fq "${map_dir}/${names}_rawIllumina_bwamem_sorted_unmapped.fq"

  rm "${map_dir}/${names}_rawIllumina_bwamem_sorted_unmapped.bam" &

  ##-----Qualimap

  mkdir -p "${map_dir}/bamqc"

  qualimap bamqc -bam "${map_dir}/${names}_rawIllumina_bwamem_sorted.bam" -outdir "${map_dir}/bamqc" --java-mem-size=80G

  mkdir -p "${BaseLocation}/log"
  echo ${BaseLocation}"/"${names}"/bwaMapping2DB/bamqc" >> "${BaseLocation}/log/multiqc_logfile_finalGenome.txt"
   done
   
   

##-------------------------------------------------  
##-----mapping depth
##-------------------------------------------------  
# Per-position depth (samtools depth -a) of all sequences whose name starts
# with "Streptococcus_ph", for every sample, stacked into one table with the
# sample name appended as last column.
rm ${BaseLocation}/complete_depth.txt
for names in $(cut -f 1 /archiv/Projects/2019_RMK202_analysis/01_log/locationRawData_allSamples.txt)
do
  echo $names
  # alternative input: ${BaseLocation}/${names}/bwaMapping2DB/${names}_mapping2ref.bam
  samtools depth -a ${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam \
    | awk -F "\t" -v sampsss="$names" 'BEGIN{OFS="\t"} /^Streptococcus_ph/ {print $0,sampsss}' \
    >> ${BaseLocation}/complete_depth.txt
done

##-------------------------------------------------  
##-----check read coverage
##-------------------------------------------------  
# Collect, for every metagenomic sample, the percentage given in parentheses
# on the "number of mapped reads" line of the qualimap report, and write
# <sample> TAB onlyMeta TAB <percentage> to log/mappings.txt.
name_folder=02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final 
threads=37
names=Konserve_202

num=1
rm -r ${BaseLocation}/log/mappings.txt

#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples.txt)
do
  # text after "(" on the matching line, with the trailing "%)" removed
  mapped=$(
    grep "     number of mapped reads = " ${BaseLocation}/${names}/bwaMapping2DB//bamqc/genome_results.txt \
      | cut -d '(' -f 2 \
      | sed 's/%)//g'
  )

  mkdir -p ${BaseLocation}/log
  echo -e ${names}"\tonlyMeta\t"${mapped} >> ${BaseLocation}/log/mappings.txt
done

This is a good [blog](https://bioinformatics-core-shared-training.github.io/cruk-summer-school-2017/Day2/vcf-intro.nb.html) on how to use freebayes and how to parse the output. Finally, I will run freebayes on all samples simultaneously.

Interesting output from column 11 on:

GT Genotype v :1 DP Read Depth :1040 DPR Number of observation for each allele : 1,1037 RO Reference allele observation count :1 QR Sum of quality of the reference observations :10 AO Alternate allele observation count :1037 QA Sum of quality of the alternate observations :36917 GL Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy

Very nice blog illustrating good filtering practices.

Here we do snpEff.



###================
##description file
###================
## First set of settings. threads, samplesss, logFilelocation, BaseLocation and
## Assembly are all assigned again a few lines below, so of this first set only
## FREEBAYES_out remains in effect until the database section, where
## sampleShort is reassigned as well.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt  ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/di_rmk202_MAG_reference_polished.fasta
## sub-folder of BaseLocation used for the SnpEff output directory below
FREEBAYES_out=freebayesOuput_WithONT_Parallel_default
sampleShort=PROKKA_meta_all ##For st_thermophilus (replaced by PGAP_meta_all in the database section)



## Second set of settings: these values are the ones in effect afterwards.
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt  ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta

##!!!!!!!!!!!!!change names of Forward and Reverse reads


##=======================================================================================================================
##-----------------------------------------------all together----------------------------------------------
##=======================================================================================================================
##==================
##Building Database
##==================
# Builds a snpEff database named ${sampleShort} from the assembly (${Assembly})
# and the PROKKA CDS annotations of both MAGs.
#/archiv/Projects/2018_Culturomics/genomes/Lactobacillus_delbrueckii_RMK202.fna
#/archiv/Projects/2018_Culturomics/genomes/Streptococcus_thermophilus_RMK202.fna

snpeff_home=/home/vincent/miniconda2/share/snpeff-4.3.1t-2

##-------------
#01 name of the database
##-------------

#sampleShort=S_thermophilus_mag_rmk202
#sampleShort=CP046134 ##For st_thermophilus
#sampleShort=PROKKA_meta_all/
sampleShort=PGAP_meta_all

##-------------
##02_add genome name to snpEff.config
##-------------
# This step used to be a manual edit in vim followed by a bare config line,
# which the shell would try to execute as a command. It also named
# PROKKA_meta_all, while the database built below is ${sampleShort}.
# The entry is now appended non-interactively, and only if it is missing.
# The leading newline guards against a config file without a final newline.
# Entries added by hand in the past:
#S_thermophilus_mag_rmk202.genome : S_thermophilus_mag_rmk202
#L_delbrueckii_mag_rmk202.genome : L_delbrueckii_mag_rmk202
#Streptococcus_phage_rmk202.genome : Streptococcus_phage_rmk202
#PGAP_meta_all.genome : PGAP_meta_all
#PROKKA_meta_all.genome : PROKKA_meta_all
grep -q "^${sampleShort}\.genome" "${snpeff_home}/snpEff.config" || \
  printf '\n%s.genome : %s\n' "${sampleShort}" "${sampleShort}" >> "${snpeff_home}/snpEff.config"

#/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/circlatorAfter//circlaring_ldel/final/L_delbrueckii_mag_rmk202.fasta 

##-------------
#03_put genome fasta 
##-------------
# start from an empty data folder; -f so that a first run does not fail
rm -rf "${snpeff_home}/data/${sampleShort}"
mkdir -p "${snpeff_home}/data/${sampleShort}"

cat "${Assembly}" > "${snpeff_home}/data/${sampleShort}/sequences.fa"

##-------------
#04 genes gff 
##-------------

#grep "^#" -v /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/PGAP_assembly/sterm/S_thermophilus_RMK202.current.gff|grep -e "${sampleShort}" |cut -f 1|sort|uniq -c

#cat /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/PGAP_assembly/sterm/S_thermophilus_RMK202.current.gff | grep -e "^${sampleShort}" |awk -F "\t" '{OFS"\t"}{if($3=="CDS") print $0}' > /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}//genes.gff

##-------------
#04 all genes gff 
##-------------

##already preped in RAST chunk
 #cat /archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/spades_nonMapping_assembly/annotation_eggnog_rast/assembly_20191129/merged/all_genes_rmk202.gff |sed 's/gene/CDS/g' > /home/vincent/miniconda2/share/snpeff-4.3.1t-2/data/${sampleShort}//genes.gff

# CDS rows of both PROKKA annotations; the PROKKA contig names are replaced by
# the accessions CP046134 (S. thermophilus) and CP046131 (L. delbrueckii).
awk -F "\t" '{OFS="\t"}{if( $3=="CDS")print $0}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/202-SMAG/PROKKA_04012020.gff|sed 's/^202-SMAG-1/CP046134/g'  > "${snpeff_home}/data/${sampleShort}/genes.gff"

awk -F "\t" '{OFS="\t"}{if( $3=="CDS")print $0}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Ldel//202-LMAG/PROKKA_04012020.gff|sed 's/^202-LMAG-1/CP046131/g'  >> "${snpeff_home}/data/${sampleShort}/genes.gff"

##-------------
##05_build database
##-------------

cd "${snpeff_home}/"
java -jar snpEff.jar build -gff3 -v ${sampleShort}

##==================
##running snpEff
##==================

##-------------
##06_preparation of SNV file
##-------------

#grep -e "${sampleShort}" ${BaseLocation}/${FREEBAYES_out}/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL.recode.vcf > ${BaseLocation}/${FREEBAYES_out}/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_all.vcf

##-------------
##07 snpEff calling
##-------------

cd /home/vincent/miniconda2/share/snpeff-4.3.1t-2/
#rm -r ${sampleShort} ${BaseLocation}/${FREEBAYES_out}/SnpEff/${sampleShort}/
mkdir -p ${sampleShort} ${BaseLocation}/${FREEBAYES_out}/SnpEff/${sampleShort}/

# folder and common file-name stem of the freebayes results used below
vcf_dir=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default
vcf_stem=variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL

#java -Xmx64g -jar /home/vincent/miniconda2/share/snpeff-4.3.1t-2/snpEff.jar ${sampleShort} ${vcf_dir}/${vcf_stem}.recode.vcf > ${BaseLocation}/${FREEBAYES_out}/SnpEff/${vcf_stem}_snpeff.vcf

# drop every line that contains "phage" (this applies to header lines as well)
grep -v "phage" ${vcf_dir}/${vcf_stem}.recode.vcf > ${vcf_dir}/${vcf_stem}.recode_wophage.vcf

# first line of the prepared VCF, one tab-separated field per line, minus the
# first nine fields -> list of sample names
head -1 ${vcf_dir}/${vcf_stem}_prepR.vcf | tr '\t' '\n' | tail -n +10 > ${vcf_dir}/samples_names.txt

# annotate the phage-free variant calls with the database built above
java -Xmx64g -jar /home/vincent/miniconda2/share/snpeff-4.3.1t-2/snpEff.jar ${sampleShort} \
  ${vcf_dir}/${vcf_stem}.recode_wophage.vcf > ${vcf_dir}//${vcf_stem}_snpeff.vcf


##-------------
##08_prep for R
##-------------
# Reduce the annotated VCF to a plain table for R: header lines (starting
# with "#") are skipped and only the first eight tab-separated columns are
# written, so the per-sample genotype columns are left out.

vcf_dir=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP_Final/freebayesOuput_WithONT_Parallel_default
vcf_stem=variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL

awk 'BEGIN{FS=OFS="\t"} !/^#/ {print $1,$2,$3,$4,$5,$6,$7,$8}' \
  ${vcf_dir}//${vcf_stem}_snpeff.vcf > ${vcf_dir}//${vcf_stem}_snpeff_prepforR.vcf


##==================
##transfer local 
##==================
# NOTE(review): despite the heading, nothing is transferred here; the block
# only assigns the shell variables referenced by the snpEff steps
# (BaseLocation, FREEBAYES_out) plus general run settings.

# thread count (not referenced in the visible part of this section)
threads=37
samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt  ##the file with all sample names
logFilelocation=/archiv/Projects/2019_RMK202_analysis/01_log
# NOTE(review): this path ends in "..._PGAP", while the hard-coded VCF paths
# used by the snpEff commands above end in "..._PGAP_Final" -- confirm which
# directory is intended.
BaseLocation=/archiv/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_single_rmk202_final_kneaddata_withStrains_PGAP
# polished reference assembly (FASTA)
Assembly=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/di_rmk202_MAG_reference_polished.fasta
# sub-directory of BaseLocation that holds the freebayes output
FREEBAYES_out=freebayesOuput_WithONT_Parallel_default

here we merge all annotation knowledge we have of the SNPs:

  1. SNPeffect
  2. eggnog annotation
  3. repeat annotation
  4. core genes

Thereafter we have a long list of information for every SNP for which we can go into R.

Hereafter we go into R and do the following filtering.

The coverage of the alternative allele must be at least 3 or zero.

We have to be careful when removing low-quality SNP calls, as they seem to be low quality not in the sense of low coverage but rather because they are supported by only a fraction of the samples. This makes sense for many SNPs that are only valid for Sterm and have NAs in the Ldel genomes. Therefore we instead select SNPs by requiring a minimum coverage supporting the SNP per sample.

!!IMPORTANT!! Before starting we have to add the following to the SNP.vcf:

  1. SNPeff
  2. if core or not with roary and intersect
  3. (if repeat within repeat)
library(tidyverse)
library(vcfR)
library(readr)
library(plyr)
library(dplyr)
library(ggplot2)
library(xlsx)

##==================
##01_import data
##==================
# Read the freebayes SNV table (tab separated). Later steps treat columns
# 10..n as per-sample genotype strings and use the `#CHROM` and INFO columns.
#snps_freebayes <- read_delim("/home/vincent/Desktop/Projects/2019_RMK202_analysis/06_snpCalling2ONT/variantCallsfreebayes_all_f_005_min_20x_prepforR.vcf", "\t", escape_double = FALSE, trim_ws = TRUE)
# snps_freebayes <- read_delim("/home/vincent/Desktop/Projects/2019_RMK202_analysis/06_snpCalling2ONT_new/variantCallsfreebayes_minAlt3x_0.05_minCov10x_QC_noINDEL_prepR_new.vcf", "\t", escape_double = FALSE, trim_ws = TRUE)
snps_freebayes <- read_delim("../data_zenodo/non_genomic_data/variantCallsfreebayes.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
# A second file path was pasted here as bare text, which R cannot parse; it is
# kept as a comment because nothing in this section reads it:
# ../data_zenodo/non_genomic_data/Coverage_bacteria_and_phages_from_MAG.bed


table(snps_freebayes$`#CHROM`)

# Pull MQM out of INFO by its key rather than by position (it used to be taken
# as the 16th ";"-separated field, which silently breaks if INFO is laid out
# differently). Where several values are given, the first is used, as before.
mqm_field <- str_match(snps_freebayes$INFO, "(?:^|;)MQM=([^;]*)")[, 2]
snps_freebayes$MQM <- as.numeric(sub(",.*$", "", mqm_field))


# colnames(snps_freebayes)
# snps_freebayes[snps_freebayes$QUAL<=20,]
# snps_freebayes <- snps_freebayes[snps_freebayes$QUAL>20,]
# dim(snps_freebayes_new)
# dim(snps_freebayes)
# snps_freebayes <- subset(snps_freebayes, select=-c(di_K1_10h,th_K1_8h))


##==================
##01_filter SNVs MQM larger than 30 
##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
##==================
# Sites are kept only when MQM is strictly above 30 (unchanged). The removed
# set is now the exact complement, so sites with MQM == 30 or a missing MQM
# show up in the diagnostic table instead of vanishing unreported.
keep_mqm <- !is.na(snps_freebayes$MQM) & snps_freebayes$MQM > 30
tmp_remove <- snps_freebayes[!keep_mqm, ]
table(tmp_remove$`#CHROM`)
# MQM was only a helper column; drop it so the sample columns start at 10 again.
snps_freebayes <- snps_freebayes[keep_mqm, ] %>% select(-MQM)
table(snps_freebayes$`#CHROM`)


##==================
##01_filter SNVs MQM larger than 30 
##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
##==================

##==================
##02_long list and frequency calc
##==================

table(snps_freebayes$`#CHROM`)

## wide -> long: columns 10..n are the per-sample genotype strings; the sample
## name becomes a factor whose levels follow the column order
snps_freebayes_long <- gather(
  snps_freebayes,
  key = sample, value = Snps,
  10:ncol(snps_freebayes),
  factor_key = TRUE, na.rm = TRUE
)
nrow(snps_freebayes_long)
table(snps_freebayes_long$sample)

###------------------------remove non-called snps ("." genotype)
is_called <- snps_freebayes_long$Snps != "."
snps_freebayes_long_cleaned <- snps_freebayes_long[is_called, ]
nrow(snps_freebayes_long_cleaned)
# snps_freebayes_long[c(7438,7437,7439, 7440, 7441, 7442, 7443, 7444, 7445),]

###------------------------calculate allel frequency
## split the ":"-separated genotype string into its fields, then derive
##   allel_frequency = AO / DP   and   QualityScore = QA / DP
## transform() returns a plain data.frame with syntactic column names, which
## is why the contig column is addressed as X.CHROM from here on.
genotype_fields <- c("GT","DP","AD","RO","QR","AO","QA","GL")
genotype_split <- separate(snps_freebayes_long_cleaned, "Snps", into = genotype_fields, sep = ":")
sample_relative_temp <- transform(
  genotype_split,
  allel_frequency = as.numeric(AO) / as.numeric(DP),
  QualityScore = as.numeric(QA) / as.numeric(DP)
)



##==================
##03_filter QA/DP > 5 (QualityScore)
##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus"
##==================
## first drop calls for which QA/DP could not be computed
has_quality <- !is.na(sample_relative_temp$QualityScore)
sample_relative_temp_filter <- sample_relative_temp[has_quality, ]

# exploratory checks used to choose the threshold:
# tmp <- sample_relative_temp_filter[which(sample_relative_temp_filter$sample=="di_K2_6h"),]
# nrow(tmp)
# table(tmp$sample)
# tmp$QualityScore <- as.numeric(tmp$QualityScore)
# sum(tmp$QualityScore>5)
# hist(tmp$QualityScore)
# hist(sample_relative_temp_filter$QualityScore)

## keep calls with QualityScore strictly above 5
sample_relative_temp_withONT <- subset(sample_relative_temp_filter, QualityScore > 5)
table(sample_relative_temp_withONT$sample)
# sample_relative_temp_withONT <- sample_relative_temp

## site id: contig_position_ref_alt
sample_relative_temp_withONT$site <- with(
  sample_relative_temp_withONT,
  paste(X.CHROM, POS, REF, ALT, sep = "_")
)

sample_relative_temp <- sample_relative_temp_withONT
# sample_relative_temp <- sample_relative_temp_filter ### without the QualityScore filter

##==================
##03_filter
##==================

## drop calls without a usable allele frequency (NA); presumably these are
## sites with more than one alternative allele, where AO is not a single number
# nrow(sample_relative_temp)
# nas <- sample_relative_temp[which(is.na(sample_relative_temp$allel_frequency)),]
# nrow(nas)
sample_relative_temp_cleaned <- subset(sample_relative_temp, !is.na(allel_frequency))
# nrow(sample_relative_temp_cleaned)

## remove low coverage: keep calls with read depth strictly above the cutoff
cutoff <- 30
sample_relative_temp_cleaned$DP <- as.numeric(sample_relative_temp_cleaned$DP)
nrow(sample_relative_temp_cleaned)
sample_relative_safe <- subset(sample_relative_temp_cleaned, DP > cutoff)
nrow(sample_relative_safe)

##make all allele frequencies smaller than 0.05 to NA
# sample_relative_safe[sample_relative_safe$allel_frequency<0.05,"allel_frequency"] <- NA

#is.na(sample_relative_safe$allel_frequency)

## alternative-allele support: keep calls with no alternative reads (AO == 0)
## or with at least three (AO > 2); calls backed by one or two reads are dropped
sample_relative_safe$AO <- as.numeric(sample_relative_safe$AO)
alt_reads <- sample_relative_safe$AO
# nrow(sample_relative_safe[sample_relative_safe$AO<=2,])
# nrow(sample_relative_safe[sample_relative_safe$AO==0 | sample_relative_safe$AO>2,])
weakly_supported <- alt_reads > 0 & alt_reads <= 2
nrow(sample_relative_safe[weakly_supported, ])
well_supported <- alt_reads == 0 | alt_reads > 2
sample_relative_safe <- sample_relative_safe[well_supported, ]
nrow(sample_relative_safe)


##==================
##04_preperation and make wide
##==================

sample_relative_temp_cleaned <- sample_relative_safe
#ggplot(sample_relative_temp_cleaned, aes(y=DP))+geom_boxplot()+ylim(c(0,1000))


## site id: contig_position_ref_alt
sample_relative_temp_cleaned$site <- with(
  sample_relative_temp_cleaned,
  paste(X.CHROM, POS, REF, ALT, sep = "_")
)


### back to wide: one row per site, one allele-frequency column per sample

nrow(sample_relative_temp_cleaned)

wide_input_cols <- c("X.CHROM","POS","site","allel_frequency","sample")
sample_relative_safe_final_prep <- sample_relative_temp_cleaned[, wide_input_cols]
#nrow(sample_relative_safe_final_prep[grep("S_thermophilus_RMK202",sample_relative_safe_final_prep$site),])

## site/sample combinations that did not survive the filters become 0
wide_with_gaps <- spread(sample_relative_safe_final_prep, key = sample, value = allel_frequency)
sample_relative_safe_final_tsne <- replace(wide_with_gaps, is.na(wide_with_gaps), 0)
nrow(sample_relative_safe_final_tsne)
table(sample_relative_safe_final_prep$sample)
##==================
##05_filter rowsums!=0
##==================

## sites whose allele frequency is 0 in every sample carry no information;
## the first three columns are X.CHROM, POS and site, the rest are samples
nrow(sample_relative_safe_final_tsne)
frequency_cols <- 4:ncol(sample_relative_safe_final_tsne)
site_totals <- rowSums(sample_relative_safe_final_tsne[, frequency_cols], na.rm = TRUE)
sum(site_totals == 0)


sample_relative_wide <- sample_relative_safe_final_tsne[site_totals > 0, ]
table(sample_relative_wide$X.CHROM)

## sites on contigs whose name contains "phage", kept separately
on_phage_contig <- grepl("phage", sample_relative_wide$X.CHROM)
sample_relative_wide_phagesONly <- sample_relative_wide[on_phage_contig, ]
# table(sample_relative_wide_phagesONly$X.CHROM)
# nrow(sample_relative_wide)

## POS is already part of the site id
sample_relative_wide$POS <- NULL

## species label from the contig accession; other contig names stay as they are
species_by_contig <- c("CP046131"="L. delbrueckii","CP046134"="S. thermophilus")
sample_relative_wide$species <- revalue(sample_relative_wide$X.CHROM, species_by_contig)
sample_relative_wide$culture <- "RMK202"
nrow(sample_relative_wide)
##==================
##06_add gene information
##==================

## per-SNV annotation table (snpEff effect, eggNOG, repeats, core/accessory)
annotation_file <- "~/Desktop/Projects/2019_Pilotplan/04_mapping2ONT/SnpEff/new/RMK202_snpeff_eggnog_repeats_core.txt"
RMK202_snpeff_eggnog_repeats_core <- read_csv(annotation_file)
# nrow(RMK202_snpeff_eggnog_repeats_core)
# nrow(sample_relative_wide)
#RMK202_snpeff_eggnog_repeats_core_subset <- subset(RMK202_snpeff_eggnog_repeats_core, select=c(X.CHROM,site,finalName,effect,significance,geneName,GOs,EC,KEGG_ko,COG_Functional_Category,Repeat_identity,core))

## left join on the site id: every SNV is kept, annotated or not; columns that
## exist in both tables get the suffixes .x (SNV table) and .y (annotation)
sample_relative_wide_description <- merge(
  x = sample_relative_wide,
  y = RMK202_snpeff_eggnog_repeats_core,
  by = "site",
  all.x = TRUE
)
# nrow(sample_relative_wide_description)

#---------------------------------subset for interesting columns
colnames(sample_relative_wide_description)

annotation_cols <- c(
  "X.CHROM.x","species.x","site","culture","finalName","Preferred_name",
  "effect","significance","geneName","COG_Functional_Category",
  "Repeat_cluster","core"
)
sample_cols <- c(
  "L104","L108","L35","L44","L70","L71","L80","L99","S50","S72",
  "Lyo_202_2012","Lyo_202_2014","Konserve_202","RMK202","Versand_202",
  "lyo202_96","di_K2_6h","th_K2_8h",
  "G1_6_18","G2_6_18","G3_6_18","G4_6_18","G5_6_18",
  "12107","24776","24778","24779","24780","24781","24782","24783","24798",
  "24777","13491","13492","13493","13494","13495","13496","13497","13498",
  "13500","24737","24738","24739","24740","13499","24853","24854","24855"
)
## the column order set here is relied on later, when "mst6":"24855" is used
## as a column range
sample_relative_wide_description_interest <-
  sample_relative_wide_description[, c(annotation_cols, sample_cols)]

#"G4_6_18"

colnames(sample_relative_wide_description_interest)
nrow(sample_relative_wide_description_interest)
##==================
##07 rename samples
##==================

## display names for the sample columns (an earlier version of this mapping
## additionally renamed S50 -> mst1 and S72 -> mst2)
column_labels <- c(
  "species.x"="species",
  "L70"="mst3", "L44"="mst10", "L108"="mst7", "L99"="mst8", "L104"="mst6",
  "L80"="mst5", "L35"="mst9", "L71"="mst4",
  "lyo202_96"="Lyo\n1996", "Lyo_202_2012"="Lyo\n2012", "Lyo_202_2014"="Lyo\n2014",
  "Konserve_202"="working\nstock",
  "RMK202"="starter\nculture\n2012", "Versand_202"="starter\nculture\n2018",
  "di_K2_6h"="cheesemaking\nday1", "th_K2_8h"="cheesemaking\nday2",
  "G1_6_18"="experiment_A", "G2_6_18"="experiment_B", "G3_6_18"="experiment_C",
  "G4_6_18"="experiment_D", "G5_6_18"="experiment_E"
)
colnames(sample_relative_wide_description_interest) <-
  revalue(colnames(sample_relative_wide_description_interest), column_labels)


##==================
##08_remove non-bacterial SNPs
##==================
table(sample_relative_wide_description_interest$X.CHROM.x)
nrow(sample_relative_wide_description_interest)

## keep only the two bacterial chromosomes (formerly named
## "L_delbrueckii_RMK202" and "S_thermophilus_RMK202")
bacterial_contigs <- c("CP046131","CP046134")
on_bacterial_contig <- sample_relative_wide_description_interest$X.CHROM.x %in% bacterial_contigs
sample_relative_wide_description_interest_subset <-
  sample_relative_wide_description_interest[on_bacterial_contig, ]
nrow(sample_relative_wide_description_interest_subset)
table(sample_relative_wide_description_interest_subset$X.CHROM.x)
sample_relative_wide <- sample_relative_wide_description_interest_subset


##==================
##09_order variables properly
##==================

## fixed level order for the snpEff impact classes
impact_levels <- c("MODIFIER","LOW","MODERATE","HIGH")
sample_relative_wide$significance <- factor(sample_relative_wide$significance, levels = impact_levels)


##==================
##10_dN_dS ratio
##==================
# Per gene (finalName), count SNVs by snpEff impact class:
#   dS = number of LOW-impact SNVs, dN = number of MODERATE or HIGH ones,
# and derive dN/(dN+dS) and dN/dS. MODIFIER SNVs are excluded beforehand.
###-----------prep for 
prep_for_dN_dS_wide <- sample_relative_wide[which(sample_relative_wide$significance!="MODIFIER"),] 

###-----------calculate dS/dN ratio
# One small data frame per gene, bound together once at the end instead of
# growing the result with rbind() inside a loop. Row order (first appearance
# of each gene) and column names are the same as before; a missing gene name
# still yields a row with zero counts.
dnds_genes <- unique(prep_for_dN_dS_wide$finalName)
dnds_impact <- as.character(prep_for_dN_dS_wide$significance)
dnds_rows <- lapply(dnds_genes, function(gene) {
  hits <- which(prep_for_dN_dS_wide$finalName == gene)
  n_dS <- sum(dnds_impact[hits] == "LOW")
  n_dN <- sum(dnds_impact[hits] %in% c("MODERATE", "HIGH"))
  data.frame(
    gene = gene,
    dS = n_dS,
    dN = n_dN,
    dN_dN.dS = n_dN / (n_dS + n_dN),
    dN_dS = n_dN / n_dS,
    stringsAsFactors = FALSE
  )
})
dN_dS_ratios <- if (length(dnds_rows) > 0) do.call(rbind, dnds_rows) else data.frame()

# dN/dS is Inf for genes without a LOW-impact SNV and NaN when both counts are
# zero, so the mean line is computed from the finite values only; a plain
# mean() is Inf/NaN as soon as one such gene exists and no line is drawn.
# The axis label now names the quantity that is plotted (dN_dS); it used to
# read "dN/(dN+dS)".
# NOTE(review): if the histogram was meant to show dN/(dN+dS), switch the
# aesthetic to dN_dN.dS instead of keeping this label.
finite_dN_dS <- dN_dS_ratios$dN_dS[is.finite(dN_dS_ratios$dN_dS)]
plotDens <- ggplot(dN_dS_ratios,aes(x=dN_dS))+geom_histogram()+
  geom_vline(xintercept=mean(finite_dN_dS))+
  theme_classic()+
  labs(x="dN/dS",
       y="gene count")

plotDens
dN_dS_ratios <- arrange(dN_dS_ratios,desc(dN_dS)) 

##-------------------------------prep snpeff
## gene-level annotation: drop the per-SNV columns, then collapse duplicates
gene_level_annotation <- select(
  RMK202_snpeff_eggnog_repeats_core,
  -c("X1","X.CHROM","species","POS","REF","ALT","quality","site","effect","significance")
)
annotation_prep <- distinct(gene_level_annotation)

## attach the annotation to every gene of the dN/dS table (left join)
dN_dS_ratios_final <- merge(
  x = dN_dS_ratios, y = annotation_prep,
  by.x = "gene", by.y = "finalName",
  all.x = TRUE
)
dN_dS_ratios_final_02 <- arrange(dN_dS_ratios_final, desc(dN_dS))

##-------------------------------wirte to file

# write.xlsx(dN_dS_ratios_final_02,file="Users//Desktop/presenations/my_presentations/20191007_groupmeeting/figures/dN_dS_ratios_genes_rmk202.xls", sheetName = "Sheet1", 
  # col.names = TRUE, row.names = FALSE, append = FALSE)

##-------------------------------merge with sample_relative_wide
## total number of counted SNVs per gene
dN_dS_ratios$mutations <- dN_dS_ratios$dS + dN_dS_ratios$dN
dN_dS_ratios_prep <- dN_dS_ratios[, c("gene", "dN_dN.dS", "mutations")]

## add dN/(dN+dS) and the mutation count to every SNV of the gene; finalName
## becomes the first column and the two new columns are appended at the end
sample_relative_wide <- merge(
  x = sample_relative_wide, y = dN_dS_ratios_prep,
  by.x = "finalName", by.y = "gene",
  all.x = TRUE
)
# colnames(sample_relative_wide)
# RMK202_snpeff_eggnog_repeats_core_uniuqes <- sample_relative_wide %>%select(-c("X.CHROM.x","POS","REF","ALT","quality","site","effect","significance","best_tax_level")) %>% distinct()
##==================
##11_make long
##==================
colnames(sample_relative_wide)

## one row per site and sample: all columns from "mst6" through "24855" are
## sample columns; the sample name is stored as a factor in column order
sample_relative_long <- gather(
  sample_relative_wide,
  key = sample, value = Snps,
  "mst6":"24855",
  factor_key = TRUE, na.rm = TRUE
)
nrow(sample_relative_long)

## the contig column carries the merge suffix, so it is addressed by its full
## name rather than through partial matching of `$X.CHROM`
table(sample_relative_long[["X.CHROM.x"]])


##==================
##12_make a unique gene infomoration
##==================

## re-read the per-SNV annotation table (same file as in step 06)
annotation_file <- "~/Desktop/Projects/2019_Pilotplan/04_mapping2ONT/SnpEff/new/RMK202_snpeff_eggnog_repeats_core.txt"
RMK202_snpeff_eggnog_repeats_core <- read_csv(annotation_file)

## one row per distinct gene annotation: remove the per-SNV columns first
gene_level_columns <- select(
  RMK202_snpeff_eggnog_repeats_core,
  -c("X1","X.CHROM","POS","REF","ALT","quality","site","effect","significance","best_tax_level")
)
RMK202_snpeff_eggnog_repeats_core_uniuqes <- distinct(gene_level_columns)

## add dN/(dN+dS) and the mutation count per gene (left join on the gene name)
RMK202_snpeff_eggnog_repeats_core_uniuqes_final <- merge(
  x = RMK202_snpeff_eggnog_repeats_core_uniuqes, y = dN_dS_ratios_prep,
  by.x = "finalName", by.y = "gene",
  all.x = TRUE
)

write.table(
  RMK202_snpeff_eggnog_repeats_core_uniuqes_final,
  file = "~/Desktop/presenations/my_presentations/20191007_groupmeeting/figures/RMK202_snpeff_eggnog_repeats_core_dNdS_uniqueGenes.txt",
  quote = FALSE,
  sep = "\t"
)

colnames(RMK202_snpeff_eggnog_repeats_core_uniuqes_final)

## NOTE(review): despite its name, the "_reduce" table is derived from the raw
## per-SNV table and not from ..._uniuqes_final -- confirm this is intended.
columns_without_gene_annotation <- select(
  RMK202_snpeff_eggnog_repeats_core,
  -c("finalName","Preferred_name","EC","KEGG_ko","KEGG_Reaction","CAZy","BiGG_Reaction","COG_Functional_Category","eggNOG free text description")
)
RMK202_snpeff_eggnog_repeats_core_uniuqes_final_reduce <- distinct(columns_without_gene_annotation)


###------------------------------------------
#wide only metagenomes
###------------------------------------------

table(sample_relative_long$sample)

## restrict to the metagenome samples (the remaining sample columns are left out)
metagenome_samples <- c(
  "Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock",
  "starter\nculture\n2012","starter\nculture\n2018",
  "cheesemaking\nday1","cheesemaking\nday2",
  "experiment_A","experiment_B","experiment_C","experiment_D","experiment_E"
)
sample_relative_long_meta <- filter(sample_relative_long, sample %in% metagenome_samples)


## back to one row per site, keeping only SNVs flagged as "core"
metagenome_wide_all <- spread(sample_relative_long_meta, key = sample, value = Snps)
sample_relative_wide_meta <- filter(metagenome_wide_all, core == "core") #%>%replace(is.na(.), 0)

##----------------
##stats
##----------------

table(sample_relative_wide_meta$core)
length(sample_relative_wide_meta$core)
table(sample_relative_wide_meta$species)
table(sample_relative_wide_meta$significance)

## collapse the four snpEff impact classes into two groups
impact_to_group <- c(
  "MODIFIER"="synonymous", "LOW"="synonymous",
  "MODERATE"="non-synonymous", "HIGH"="non-synonymous"
)
sample_relative_wide_meta$synonomous <- revalue(sample_relative_wide_meta$significance, impact_to_group)
table(interaction(sample_relative_wide_meta$species, sample_relative_wide_meta$synonomous))

## SNV counts per group, stacked by impact class, one panel per species
plotSNPs <- ggplot(sample_relative_wide_meta, aes(x = synonomous, fill = significance)) +
  geom_bar() +
  theme_classic() +
  facet_wrap(~species, scales = "free") +
  labs(x = "", y = "count", fill = "SNP effect") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9))
plotSNPs


  # Write the SNP-effect bar chart to a PNG file.
  # svg("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance_bac_snps.svg",width=5,height=4.5)
png("~/Desktop/Manuscripts/2019_RMK202/Figures/supplement_SNP_effect.png", width = 1600, height = 1800,res=300)

# A ggplot object is only drawn when it is printed. Auto-printing happens at
# the console but not when the script is run through source(), so print
# explicitly to make sure the file is not left empty.
print(plotSNPs)

dev.off()


  ##============
### plot 
   ##============

all_colours <- rev(c("#EB4D4D","#10B552") ) 

# Two diagnostics on sample_relative_safe_woSTRAINS used to sit here, but that
# object is only created further down, so they stopped a fresh run with
# "object not found". They have been removed.

  ##============
### subset :
##----01==all SNVs (the MODIFIER filter is commented out)
##----02==only core genes mutations
##----03==allele frequencies <= 0.03 are set to 0 (rows are kept)
##----04==only moderate or high significance SNVs
   ##============
table(sample_relative_long$sample)
nrow(sample_relative_long)
sample_relative_safe_woSTRAINS_sub01 <- sample_relative_long
# sample_relative_safe_woSTRAINS_sub01 <- sample_relative_safe_woSTRAINS[which(sample_relative_safe_woSTRAINS$significance!="MODIFIER"),]
nrow(sample_relative_safe_woSTRAINS_sub01)

# 02: SNVs flagged as "core" only
sample_relative_safe_woSTRAINS_sub02 <- sample_relative_safe_woSTRAINS_sub01[which(sample_relative_safe_woSTRAINS_sub01$core=="core"),]
table(sample_relative_safe_woSTRAINS_sub02$sample)

# 03: split at 0.03, zero the low-frequency part, then stack both parts again
sample_relative_safe_woSTRAINS_sub03 <- sample_relative_safe_woSTRAINS_sub02[which(sample_relative_safe_woSTRAINS_sub02$Snps>0.03),]
# sample_relative_safe_woSTRAINS_sub05 <- sample_relative_safe_woSTRAINS_sub03

sample_relative_safe_woSTRAINS_sub04 <- sample_relative_safe_woSTRAINS_sub02[which(sample_relative_safe_woSTRAINS_sub02$Snps<=0.03),]
sample_relative_safe_woSTRAINS_sub04$Snps <- 0
nrow(sample_relative_safe_woSTRAINS_sub04)
sample_relative_safe_woSTRAINS_sub05 <- rbind(sample_relative_safe_woSTRAINS_sub03,sample_relative_safe_woSTRAINS_sub04)

# 04: MODERATE / HIGH impact only
sample_relative_safe_woSTRAINS_sub06 <- sample_relative_safe_woSTRAINS_sub05[which(sample_relative_safe_woSTRAINS_sub05$significance %in% c("MODERATE","HIGH")),]

nrow(sample_relative_safe_woSTRAINS_sub05)
table(sample_relative_safe_woSTRAINS_sub05$sample)

# S. thermophilus SNVs only; the wide copy has one column per sample, with
# missing site/sample combinations replaced by 0
sample_relative_safe_Sterm_final <- sample_relative_safe_woSTRAINS_sub06[which(sample_relative_safe_woSTRAINS_sub06$species=="S. thermophilus"),]
table(sample_relative_safe_Sterm_final$sample)
sample_relative_safe_Sterm_final_wide <- spread(sample_relative_safe_Sterm_final, sample, Snps)  %>%replace(is.na(.), 0)

# (a table() of the wide table's `sample` column was dropped: spread() turns
# that column into the per-sample columns, so there was nothing to tabulate)
# length(unique(sample_relative_safe_Sterm_final$site))
# length(unique(sample_relative_safe_Sterm_final$site))
  ##============
### plot Streptococcus thermophius
   ##============



colnames(sample_relative_safe_Sterm_final)
table(sample_relative_safe_Sterm_final$sample)

## the two cheesemaking samples are relabelled "Reference 1" / "Reference 2"
# sample_relative_safe_woSTRAINS <- sample_relative_safe_Sterm_final[grep("^mst",sample_relative_safe_Sterm_final$sample,invert = TRUE),] 
reference_labels <- c("cheesemaking\nday2"="Reference 2","cheesemaking\nday1"="Reference 1")
sample_relative_safe_Sterm_final$sample <- revalue(sample_relative_safe_Sterm_final$sample, reference_labels)
# sample_relative_safe_woSTRAINS$sample <- revalue(sample_relative_safe_woSTRAINS$sample, c("Reference 2"="cheesemaking\nday2","Reference 1"="cheesemaking\nday1"))
table(sample_relative_safe_Sterm_final$sample)

## keep the metagenome samples (old and new names of the references are both
## listed); the remaining sample columns are left out
sterm_plot_samples <- c(
  "Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock",
  "starter\nculture\n2012","starter\nculture\n2018",
  "cheesemaking\nday1","cheesemaking\nday2",
  "experiment_A","experiment_B","experiment_C","experiment_D","experiment_E",
  "Reference 1","Reference 2"
)
sample_relative_safe_woSTRAINS <- filter(sample_relative_safe_Sterm_final, sample %in% sterm_plot_samples)


## the factor level order fixes the order of the samples on the x axis
sample_relative_safe_woSTRAINS$sample <- droplevels(sample_relative_safe_woSTRAINS$sample)
sterm_sample_order <- c(
  "Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock",
  "starter\nculture\n2012","starter\nculture\n2018",
  "experiment_A","experiment_B","experiment_C","experiment_D","experiment_E",
  "Reference 1","Reference 2"
)
sample_relative_safe_woSTRAINS$sample <- factor(sample_relative_safe_woSTRAINS$sample, levels = sterm_sample_order)

levels(sample_relative_safe_woSTRAINS$sample)
table(sample_relative_safe_woSTRAINS$sample)

## finally the two reference samples are removed again
sample_relative_safe_woSTRAINS <- filter(
  sample_relative_safe_woSTRAINS,
  sample != "Reference 1",
  sample != "Reference 2"
)


  # Allele-frequency trajectories of the S. thermophilus SNVs: one thin line
  # per site across the ordered samples, all in a single colour.
  # NOTE(review): the `text` aesthetic is not used by geom_line(); presumably
  # it is kept for interactive tooltips -- confirm.
  p3 <- ggplot(sample_relative_safe_woSTRAINS,aes(x=sample,y=Snps,group=site,color=species,fill=species, text =paste("effect:", effect,"\nsignficance:",significance,"\ngeneName:",geneName,"\nRepeat_identity:",Repeat_cluster,"\ncore:",core,"\nCOG:",COG_Functional_Category)))+ geom_line(size=0.2, alpha=.1)+
    # facet_grid(species~.)+
    # facet_grid(treatment~species)+
    labs("",
         x="",
         y="Alternative allele frequency")+
    #scale_x_continuous(breaks =c(0,2,4,6,8,10,12,24),labels=c(0,2,4,6,8,10,12,24))+
    #scale_y_continuous(limits = c(0,0.3))+
    theme_classic()+
   scale_fill_manual(values="#0081a7")+
   scale_color_manual(values="#0081a7")+
       # scale_color_manual(values=all_colours[2])+
    scale_x_discrete( expand = c(0, 0)) +
    theme(axis.text.x = element_text(angle = 75, hjust = 1,size=9),
          rect = element_rect(fill = "transparent"), # all rectangles
          legend.position="none"
          )

  # Show the plot in the session / rendered document.
  p3

  # Export as SVG. The plot is printed explicitly: a bare `p3` is only drawn
  # by console auto-printing, so under source() the file would stay empty.
  svg("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance_bac_snps_sterm.svg",width=6,height=5)
   # png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1800, height = 1200,res=300)
  # p3+theme(axis.title = element_blank(),axis.text.x = element_blank())
  print(p3)

  dev.off()




  ##============
### plot Lactobacillus delbrueckii
   ##============


# Prepare the L. delbrueckii SNV table for plotting.
# NOTE(review): the temporaries sub01..sub05 are computed and inspected here,
# but sub06 is finally reassigned to the full sample_relative_long (see
# below), so none of the core / frequency / significance filters reaches the
# plotted data. Confirm that the unfiltered table is intended.
nrow(sample_relative_long)
sample_relative_safe_woSTRAINS_sub01 <- sample_relative_long
# sample_relative_safe_woSTRAINS_sub01 <- sample_relative_safe_woSTRAINS[which(sample_relative_safe_woSTRAINS$significance!="MODIFIER"),]
nrow(sample_relative_safe_woSTRAINS_sub01)
table(sample_relative_safe_woSTRAINS_sub01$species)

# sub02: SNVs flagged as "core" (the first assignment is overwritten right away)
sample_relative_safe_woSTRAINS_sub02 <- sample_relative_safe_woSTRAINS_sub01
sample_relative_safe_woSTRAINS_sub02 <- sample_relative_safe_woSTRAINS_sub01[which(sample_relative_safe_woSTRAINS_sub01$core=="core"),]
nrow(sample_relative_safe_woSTRAINS_sub02)
table(sample_relative_safe_woSTRAINS_sub02$species)
# sub03: allele frequency > 0.03, taken from sub01 (i.e. not restricted to core)
sample_relative_safe_woSTRAINS_sub03 <- sample_relative_safe_woSTRAINS_sub01[which(sample_relative_safe_woSTRAINS_sub01$Snps>0.03),]
# sample_relative_safe_woSTRAINS_sub05 <- sample_relative_safe_woSTRAINS_sub03
table(sample_relative_safe_woSTRAINS_sub03$species)
# sub04: the L. delbrueckii rows of sub03; used to count distinct sites
sample_relative_safe_woSTRAINS_sub04 <- sample_relative_safe_woSTRAINS_sub03 %>% filter(species=="L. delbrueckii") 
unique(sample_relative_safe_woSTRAINS_sub04$site) %>% length()
# sub03 is redefined as the core rows with frequency <= 0.03, and the
# frequencies in sub04 (the > 0.03 rows selected above) are set to 0
sample_relative_safe_woSTRAINS_sub03 <- sample_relative_safe_woSTRAINS_sub02[which(sample_relative_safe_woSTRAINS_sub02$Snps<=0.03),]
sample_relative_safe_woSTRAINS_sub04$Snps <- 0
table(sample_relative_safe_woSTRAINS_sub04$species)

nrow(sample_relative_safe_woSTRAINS_sub04)
# the rbind result is replaced by sub04 on the next line
sample_relative_safe_woSTRAINS_sub05 <- rbind(sample_relative_safe_woSTRAINS_sub03,sample_relative_safe_woSTRAINS_sub04)
sample_relative_safe_woSTRAINS_sub05 <- sample_relative_safe_woSTRAINS_sub04

# sub06 is assigned three times in a row; only the last assignment (the full
# long table) is in effect afterwards
sample_relative_safe_woSTRAINS_sub06 <- sample_relative_safe_woSTRAINS_sub05[which(sample_relative_safe_woSTRAINS_sub05$significance %in% c("MODERATE","HIGH")),]

sample_relative_safe_woSTRAINS_sub06 <- sample_relative_safe_woSTRAINS_sub04

sample_relative_safe_woSTRAINS_sub06 <- sample_relative_long
table(sample_relative_safe_woSTRAINS_sub06$species)

# everything that is not S. thermophilus (rows with a missing species are
# dropped by which())
sample_relative_safe_Sterm_final_ldel <- sample_relative_safe_woSTRAINS_sub06[which(sample_relative_safe_woSTRAINS_sub06$species!="S. thermophilus"),]
table(sample_relative_safe_Sterm_final_ldel$species)%>%replace(is.na(.), 0)

# overwrites the wide S. thermophilus table of the same name
sample_relative_safe_Sterm_final_wide <- spread(sample_relative_safe_Sterm_final_ldel, sample, Snps)  

# prints the table left over from the S. thermophilus section
sample_relative_safe_woSTRAINS

table(sample_relative_safe_woSTRAINS_sub06$species)

# this assignment is replaced by the filter on sub06 directly below
sample_relative_safe_woSTRAINS <- sample_relative_safe_Sterm_final[grep("^mst",sample_relative_safe_Sterm_final$sample,invert = TRUE),] 


# keep the metagenome samples only
sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS_sub06 %>% filter(sample %in% c("Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock","starter\nculture\n2012","starter\nculture\n2018","cheesemaking\nday1","cheesemaking\nday2","experiment_A","experiment_B","experiment_C","experiment_D","experiment_E"))


# the factor level order fixes the order of the samples on the x axis
sample_relative_safe_woSTRAINS$sample <- droplevels(sample_relative_safe_woSTRAINS$sample)
sample_relative_safe_woSTRAINS$sample = factor(sample_relative_safe_woSTRAINS$sample, levels=c("Lyo\n1996","Lyo\n2012","Lyo\n2014","working\nstock","starter\nculture\n2012","starter\nculture\n2018","cheesemaking\nday1","cheesemaking\nday2","experiment_A","experiment_B","experiment_C","experiment_D","experiment_E"))

levels(sample_relative_safe_woSTRAINS$sample)
table(sample_relative_safe_woSTRAINS$sample)

# the two cheesemaking samples are removed from the plot data
  sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS %>% filter(sample!="cheesemaking\nday1")  %>% filter(sample!="cheesemaking\nday2")
all_colours <- rev(c("#EB4D4D","#10B552") ) 

table(sample_relative_safe_woSTRAINS$species)

# final plot data: all rows whose species is not S. thermophilus
sample_relative_safe_woSTRAINS <- sample_relative_safe_woSTRAINS[which(sample_relative_safe_woSTRAINS$species!="S. thermophilus"),]


  # Allele-frequency trajectories of the L. delbrueckii SNVs: one line per
  # site across the ordered samples, coloured from all_colours.
  # NOTE(review): the `text` aesthetic is not used by geom_line(); presumably
  # it is kept for interactive tooltips -- confirm.
  p3 <- ggplot(sample_relative_safe_woSTRAINS,aes(x=sample,y=Snps,group=site,color=species,fill=species, text =paste("effect:", effect,"\nsignficance:",significance,"\ngeneName:",geneName,"\nRepeat_identity:",Repeat_cluster,"\ncore:",core,"\nCOG:",COG_Functional_Category)))+ geom_line(size=0.5, alpha=0.5)+
    # facet_grid(species~.)+
    # facet_grid(treatment~species)+
    labs("",
         x="",
         y="Alternative allele frequency")+
    #scale_x_continuous(breaks =c(0,2,4,6,8,10,12,24),labels=c(0,2,4,6,8,10,12,24))+
    #scale_y_continuous(limits = c(0,0.3))+
    theme_classic()+
   scale_fill_manual(values=all_colours)+
   scale_color_manual(values=all_colours)+
    scale_x_discrete( expand = c(0, 0)) +
    theme(axis.text.x = element_text(angle = 75, hjust = 1,size=9),
          rect = element_rect(fill = "transparent"), # all rectangles
          legend.position="none"
          )

  # Show the plot in the session / rendered document.
  p3


  # Export as SVG. The plot is printed explicitly: a bare `p3` is only drawn
  # by console auto-printing, so under source() the file would stay empty.
  svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/F1_relativeAbundance_bac_snps_Ldel.svg",width=6,height=5)
   # png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1800, height = 1200,res=300)
  # p3+theme(axis.title = element_blank(),axis.text.x = element_blank())
  print(p3)

  dev.off()

1.3.2 Phylogeny

This is Kirstin’s approach based on orthofinder

Described in more detail here

# Collect the protein FASTA files per species as OrthoFinder input: the
# project's PROKKA proteomes plus the reference proteome L-DSM-2007.faa.
# (To run a single species by hand, set e.g. species=Sterm or species=Ldel
# and execute the loop body.)

tree_root=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree

for species in Ldel Sterm
do
  faa_dir="${tree_root}/${species}_references/FAA"

  # Start from an empty directory; -f keeps the very first run, where the
  # directory does not exist yet, from reporting an error.
  rm -rf "${faa_dir}"
  mkdir -p "${faa_dir}"

  cp /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/"${species}"/FAA_all/* "${faa_dir}/"

  #cp /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/${species}_references/PROKKA_FAA/* ${faa_dir}/

  # NOTE(review): the same file name is copied for both species -- confirm
  # that L-DSM-2007.faa exists under Sterm_references as well.
  cp /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/"${species}"_references/PROKKA_FAA/L-DSM-2007.faa "${faa_dir}/"
  echo "==========================================================="
  # `ll` is an interactive alias and is not available in scripts; use ls -l.
  ls -l "${faa_dir}/"
done


###-----------------------------------------
## OrthoFinder: infer orthogroups from the staged FAA folder of each species
###-----------------------------------------

ref_tree_root=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree

for species in Ldel Sterm
do
  mkdir -p ${ref_tree_root}/Orthofinder/
  # remove the results of a previous run before OrthoFinder is started again
  rm -r ${ref_tree_root}/Orthofinder/${species}
  #mkdir -p  /data/Project/2020_StarterCultureDiversity/11_referenceTree/Orthofinder_combined_NCBI_own/${species}/

  # -f input FASTA folder, -o output folder, -t / -a thread counts
  orthofinder \
    -f ${ref_tree_root}/${species}_references/FAA/ \
    -t 35 \
    -o ${ref_tree_root}/Orthofinder/${species} \
    -a 35
done

Here, I make a directory containing the fna and faa files of all genomes with the file name corresponding to the locus tag.




# Stage the nucleotide gene FASTA (.ffn) files next to the FAA files, one folder per species.
for species in Ldel Sterm
do
  ref_tree_root=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
  ffn_dir=${ref_tree_root}/${species}_references/FFN

  # name of the (single) OrthoFinder results folder of this species
  date=$(ls ${ref_tree_root}/Orthofinder/${species})

  # rebuild the FFN folder from scratch
  rm -r ${ffn_dir}/
  mkdir -p ${ffn_dir}/

  # first letter of the species label (not used further in this loop)
  leterzzz=${species:0:1}

  # reference genome first, then the own PROKKA-annotated genomes
  cp /data/Project/2020_StarterCultureDiversity/10_REFERENCE_GEN0MES/genomes/NCBI_Refs/${species}_references/PROKKA_FFN/L-DSM-2007.ffn ${ffn_dir}/
  cp /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/${species}/FFN_all/* ${ffn_dir}/
done

###--------------------
# Copy every genome's .faa/.ffn into <OrthoFinder results>/Genomes_FAA_FNA/, renamed to the
# locus tag found in the first FASTA header, so that file names match the gene identifiers.
for species in Ldel Sterm
do
ref_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/${species}_references
ortho_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}

# Resolve the results folder of THIS species. Previously ${date} was whatever the last
# iteration of an earlier loop had left behind, i.e. the folder name of the other species.
date=$(ls "${ortho_dir}")
genome_dir=${ortho_dir}/${date}/Genomes_FAA_FNA

rm -rf "${genome_dir}"
mkdir -p "${genome_dir}"

for faa_file in "${ref_dir}"/FAA/*.faa
do
# unmatched glob (empty folder): nothing to do
[ -e "${faa_file}" ] || continue

# strip only the trailing ".faa" (the former sed 's/.faa//g' also removed "<any char>faa" inside a name)
genomesss=$(basename "${faa_file}" .faa)

# locus tag = first header token without the leading '>' and without the "_000..." gene counter
locusTagsss=$(head -1 "${faa_file}" | cut -d ' ' -f 1 | sed 's/_000.*$//g' | sed 's/>//g')
echo -e "This  genome : " ${genomesss} " has the following locus Tag : "${locusTagsss}

cat "${ref_dir}/FAA/${genomesss}.faa" > "${genome_dir}/${locusTagsss}.faa"
cat "${ref_dir}/FFN/${genomesss}.ffn" > "${genome_dir}/${locusTagsss}.ffn"

done
done

##------=================================================================
# Extract the orthogroup sequences listed in Orthogroups_SCOG.txt for each species
##------=================================================================

for species in Ldel Sterm
do
ortho_dir=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}

# results folder of THIS species (do not rely on a ${date} left over from an earlier loop)
date=$(ls "${ortho_dir}")

# the perl script works relative to the current directory: skip the species when the folder is missing
cd "${ortho_dir}/${date}/Genomes_FAA_FNA/" || continue
perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/extract_orthologs.pl "${ortho_dir}/${date}/Orthogroups/Orthogroups_SCOG.txt" --folder ${species}

done

align protein families

Here, I back-translate the protein (faa) alignments to DNA.

With perl: prune alignments (removing columns represented by less than 50% of the sequences)



# Prune the alignments of each species with prune_aln.pl, run inside the per-species alignment folder.
for species in Ldel Sterm
do
date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})

# Never delete or prune in the wrong directory: skip the species when the folder is missing.
cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species} || continue

# remove the output of a previous run; -f: no error when there is none yet
rm -f ./*_prune.fasta
perl /data/Project/2020_StarterCultureDiversity/99_log/aln_aa_to_dna/vincent/prune_aln.pl

done

# Drop the "L-I-202-" prefix from the pruned alignments.
# NOTE(review): this runs after the loop, so ${species}/${date} are those of the last species (Sterm) only -- confirm that is intended.
sed -i 's/L-I-202-//g' /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}/*_aln_prune.fasta

##================================================================
# Remove all information (except the genome identifier) from the multi-FASTA headers:
# in every pruned alignment, everything from the first "_0" to the end of the line is cut.
##================================================================

# NOTE(review): `echo "Ldel Sterm"` prints a single line, so `tail -1` returns that whole
# line and the loop still visits BOTH species. If only Sterm was meant, this needs changing.
for species in $(echo "Ldel Sterm"|tail -1)
do
# name of the OrthoFinder results folder of this species
date=$(ls /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species})

cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}
# loop over the alignment base names (file name without the "_aln_prune.fasta" suffix)
for genesss in $(ls |grep "_aln_prune.fasta$" |sed 's/_aln_prune.fasta//g' )
do
# edits the file in place; the pattern is applied to every line, not only to header lines
sed -i "s/_0.*//" ${genesss}_aln_prune.fasta 

done
done

##================================================================
# Shorten names: rename "202-13499c" to "202-13499" in all pruned Sterm alignments
##================================================================

species=Sterm
# ${date} is expected to still hold the OrthoFinder results folder name set further up
cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/Orthofinder/${species}/${date}/Genomes_FAA_FNA/${species}

ls | grep "_aln_prune.fasta$" | while read -r aln_file
do
  sed -i 's/202-13499c/202-13499/g' "${aln_file}"
done

With perl: Concatenate pruned alignments

With RAxML: infer the phylogeny

# Infer one bootstrapped RAxML tree per species from the concatenated alignment (<species>.phylip).
ref_tree_root=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree
raxml_root=${ref_tree_root}/raxml_all_combined_NCBI_own

# start from an empty output folder
rm -r ${raxml_root}/
mkdir -p ${raxml_root}/

for species in Ldel Sterm
do
  echo -e ${species}

  # RAxML is started from inside the per-species folder
  rm -r ${raxml_root}//${species}
  mkdir -p ${raxml_root}//${species}
  cd ${raxml_root}//${species}

  # name of the OrthoFinder results folder that holds the phylip file
  date=$(ls ${ref_tree_root}/Orthofinder/${species})

  /home/vincent/anaconda3/bin/raxmlHPC-PTHREADS-AVX2 \
    -f a -x 12345 -p 12345 -# 100 -m GTRCAT \
    -s ${ref_tree_root}/Orthofinder/${species}/${date}/${species}.phylip \
    -n ${species}_all \
    -T 37
done

##---------------------
## Change some names: make the S. thermophilus tip labels start with "S-" instead of "S"
##---------------------

raxml_root=/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/raxml_all_combined_NCBI_own

# RAxML was started from inside ${raxml_root}/Sterm/, so the bipartitions tree lives in that
# sub-folder; the former input path pointed at ${raxml_root}/ itself, where no such file exists.
# The three substitutions run in order on each line: add "-" after "(S" and ",S", then
# collapse the "S--" this produces for labels that already started with "S-".
# The output location is left unchanged.
sed -e 's/(S/(S-/g' -e 's/,S/,S-/g' -e 's/S--/S-/g' \
  ${raxml_root}/Sterm/RAxML_bipartitions.Sterm_all \
  > ${raxml_root}/RAxML_bipartitions.Sterm_all_new


# L. delbrueckii tree, for reference:
#/home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/09_referenceTree/raxml_all_combined_NCBI_own/Ldel/RAxML_bipartitions.Ldel_all

find close samples

Convert the phylip file to an aligned multi-FASTA file

1.3.3 strain count

Here, I evaluate how abundant the individual S. thermophilus strains are.

library(robustbase)
library(VennDiagram)
##==================================================Streptococcus thermophilus===================================================================
colnames(sample_relative_wide)

# Core-genome S. thermophilus SNVs whose alternative-allele frequency exceeds 0.05 in at
# least one metagenome sample (missing frequencies count as 0).
# The sample columns are addressed through the `.data` pronoun: the former
# filter("Lyo\n1996" > 0.05 | ...) compared the string literals themselves with 0.05, which
# never looks at the data, so the abundance threshold was not applied at all.
# NOTE(review): with the threshold active, fewer sites may pass than before; downstream
# counts and percentages can change.
sample_relative_wide_snpsUsed <- sample_relative_wide %>%
  replace(is.na(.), 0) %>%
  filter(core == "core") %>%
  filter(species == "S. thermophilus") %>%
  filter(
    .data[["Lyo\n1996"]] > 0.05 |
      .data[["Lyo\n2012"]] > 0.05 |
      .data[["Lyo\n2014"]] > 0.05 |
      .data[["working\nstock"]] > 0.05 |
      .data[["starter\nculture\n2012"]] > 0.05 |
      .data[["starter\nculture\n2018"]] > 0.05 |
      .data[["experiment_A"]] > 0.05 |
      .data[["experiment_B"]] > 0.05 |
      .data[["experiment_C"]] > 0.05 |
      .data[["experiment_D"]] > 0.05 |
      .data[["experiment_E"]] > 0.05
  )

colnames(sample_relative_wide_snpsUsed)

# A site is assigned to a lineage when at least one isolate column of that lineage carries
# the alternative allele at a frequency above 0.8. A site can be assigned to several lineages.
sample_relative_wide_snpsUsed_lin1 <- sample_relative_wide_snpsUsed %>%
  filter(`24853` > 0.8 | `24798` > 0.8 | `13493` > 0.8 | `13500` > 0.8 | `24737` > 0.8 | `24854` > 0.8 | `13491` > 0.8 | `13492` > 0.8) %>%
  add_column(explained = "lin1")
sample_relative_wide_snpsUsed_lin2 <- sample_relative_wide_snpsUsed %>%
  filter(`S72` > 0.8 | `24855` > 0.8 | `13494` > 0.8) %>%
  add_column(explained = "lin2")
sample_relative_wide_snpsUsed_lin3 <- sample_relative_wide_snpsUsed %>%
  filter(`S50` > 0.8 | `24740` > 0.8 | `24738` > 0.8 | `13499` > 0.8) %>%
  add_column(explained = "lin3")
sample_relative_wide_snpsUsed_lin4 <- sample_relative_wide_snpsUsed %>%
  filter(`24739` > 0.8 | `13497` > 0.8 | `13496` > 0.8 | `13495` > 0.8) %>%
  add_column(explained = "lin4")

# all lineage-assigned sites; a site occurs once per lineage it was assigned to
allsites <- c(sample_relative_wide_snpsUsed_lin1$site, sample_relative_wide_snpsUsed_lin2$site, sample_relative_wide_snpsUsed_lin3$site, sample_relative_wide_snpsUsed_lin4$site)

##----------------------
# Venn diagram of the SNV sites assigned to the four lineages
##----------------------
lineage_site_sets <- list(
  sample_relative_wide_snpsUsed_lin1$site,
  sample_relative_wide_snpsUsed_lin2$site,
  sample_relative_wide_snpsUsed_lin3$site,
  sample_relative_wide_snpsUsed_lin4$site
)

# filename = NULL: return the grid object instead of writing an image file
temp <- venn.diagram(
  x = lineage_site_sets,
  category.names = c("lin 1" , "lin 2 " , "lin 3", "lin 4"),
  filename = NULL
)

plot.new()
grid.draw(temp)

# to save the diagram instead:
# pdf("testpdf", width = 14, height = 7)
# grid.draw(temp)
# dev.off()

grid.draw(temp)

# sites that were assigned to more than one lineage
site_is_repeated <- duplicated(allsites)
allsites[site_is_repeated]
sum(site_is_repeated)
sum(!site_is_repeated)

# number of sites assigned to several lineages / to exactly one lineage
site_counts <- table(allsites)
sum(site_counts > 1)
sum(site_counts == 1)

duplictednames <- names(site_counts[site_counts > 1])

##----------------------
# Classify every SNV site by the lineage(s) that explain it
# Result: sample_relative_wide_snpsUsed_final with a column `explained` holding
#   - "lin1".."lin4"            : site assigned to exactly one lineage
#   - "<l1>_<l2>_<l3>_<l4>"     : site assigned to several lineages (e.g. "lin1_NA_lin3_NA")
#   - "not explained"           : site assigned to no lineage
##----------------------

# all lineage assignments stacked; sites assigned to several lineages occur several times
sample_relative_wide_snpsUsed_new_01 <- rbind(sample_relative_wide_snpsUsed_lin1,sample_relative_wide_snpsUsed_lin2,sample_relative_wide_snpsUsed_lin3,sample_relative_wide_snpsUsed_lin4)
sample_relative_wide_snpsUsed_new_02 <- sample_relative_wide_snpsUsed_new_01 %>% filter(!site %in%duplictednames) #lineage specific snps
table(sample_relative_wide_snpsUsed_new_02$explained)
# First version: label all multi-lineage sites simply as "multiple".
# NOTE(review): the four objects assigned on the next four lines are all re-assigned in the
# "EXKURSION" part below, so this first version does not reach the final table.
sample_relative_wide_snpsUsed_new_temp <- sample_relative_wide_snpsUsed %>% filter(site %in%duplictednames)%>% add_column(explained="multiple") ##duplicated sites
sample_relative_wide_snpsUsed_new_03 <- rbind(sample_relative_wide_snpsUsed_new_02,sample_relative_wide_snpsUsed_new_temp) ##all explained sites
sample_relative_wide_snpsUsed_tmp <-  sample_relative_wide_snpsUsed %>% filter(!site %in%sample_relative_wide_snpsUsed_new_03$site) %>% add_column(explained="not explained")
sample_relative_wide_snpsUsed_final <- rbind(sample_relative_wide_snpsUsed_new_03,sample_relative_wide_snpsUsed_tmp) ##all explained sites


## Excursion: spell out WHICH lineages share each multi-lineage site
TMP <- sample_relative_wide_snpsUsed %>% filter(site %in%duplictednames)
# NOTE(review): `TMP$e` relies on partial column-name matching at best; it looks like a left-over of interactive exploration
TMP$e

# one row per multi-lineage site with one column per lineage (lineage name where assigned, NA otherwise)
short_explained <- sample_relative_wide_snpsUsed_new_01 %>%filter(site %in%duplictednames) %>%  select(c("site","species","explained")) %>% mutate(abundance = explained) %>% spread(., explained, abundance)
# combined label, e.g. "lin1_NA_lin3_NA" for a site shared by lin1 and lin3
short_explained$explained <- paste(short_explained$lin1,short_explained$lin2,short_explained$lin3,short_explained$lin4,sep="_")
short_explained_multiple <- short_explained%>% select("site","explained")
sample_relative_wide_snpsUsed_new_temp <- sample_relative_wide_snpsUsed %>% filter(site %in%duplictednames) ##duplicated sites
# attach the combined label to the multi-lineage sites
sample_relative_wide_snpsUsed_new_temp_02 <- merge(sample_relative_wide_snpsUsed_new_temp,short_explained_multiple,by="site")
sample_relative_wide_snpsUsed_new_03 <- rbind(sample_relative_wide_snpsUsed_new_02,sample_relative_wide_snpsUsed_new_temp_02) ##all explained sites
# everything that was assigned to no lineage at all
sample_relative_wide_snpsUsed_tmp <-  sample_relative_wide_snpsUsed %>% filter(!site %in%sample_relative_wide_snpsUsed_new_03$site) %>% add_column(explained="not explained")
sample_relative_wide_snpsUsed_final <- rbind(sample_relative_wide_snpsUsed_new_03,sample_relative_wide_snpsUsed_tmp) ##explained + not explained sites


# share (in percent) and absolute number of sites per `explained` class
totalSNPS <- nrow(sample_relative_wide_snpsUsed_final)
100*(table(sample_relative_wide_snpsUsed_final$explained)/totalSNPS)
table(sample_relative_wide_snpsUsed_final$explained)
##----------------------
# Boxplot of the allele frequencies per sample, one panel per `explained` class
##----------------------

# The two cheesemaking samples are shown as "Reference 1" / "Reference 2" from here on.
sample_relative_wide_snpsUsed_final <- dplyr::rename(
  sample_relative_wide_snpsUsed_final,
  "Reference 2" = "cheesemaking\nday2",
  "Reference 1" = "cheesemaking\nday1"
)

# Long format: one row per site and sample, with the frequency in `Median`.
sample_relative_wide_snpsUsed_final_long <- sample_relative_wide_snpsUsed_final %>%
  select(
    "site",
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014",
    "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E",
    "Reference 1", "Reference 2",
    "explained"
  ) %>%
  gather(
    ., sample, Median,
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014",
    "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E",
    "Reference 1", "Reference 2"
  )

# "without zeros": keep only observations with a frequency above 0.1
sample_relative_wide_snpsUsed_final_long_woZeros <- filter(
  sample_relative_wide_snpsUsed_final_long,
  Median > 0.1
)

ggplot(sample_relative_wide_snpsUsed_final_long_woZeros, aes(x = sample, y = Median)) +
  geom_boxplot() +
  theme_classic() +
  facet_wrap(~explained)


##----------------------
# Line plot: frequency of every explained site across samples, one panel per class
##----------------------

# fix the order of the samples on the x axis
sample_relative_wide_snpsUsed_final_long_woZeros$sample <- factor(
  sample_relative_wide_snpsUsed_final_long_woZeros$sample,
  levels = c(
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014",
    "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "cheesemaking\nday1", "cheesemaking\nday2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E",
    "Reference 1", "Reference 2"
  )
)

# fix the order of the panels
sample_relative_wide_snpsUsed_final_long_woZeros$explained <- factor(
  sample_relative_wide_snpsUsed_final_long_woZeros$explained,
  levels = c(
    "lin1", "lin2", "lin3", "lin4", "not explained",
    "lin1_lin2_NA_NA", "NA_NA_lin3_lin4", "lin1_lin2_lin3_lin4",
    "lin1_lin2_NA_lin4", "lin1_NA_lin3_lin4", "lin1_NA_lin3_NA",
    "lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4", "NA_lin2_lin3_lin4"
  )
)

# only sites that are explained by at least one lineage are drawn
sample_relative_wide_snpsUsed_final_long_woZeros <- filter(
  sample_relative_wide_snpsUsed_final_long_woZeros,
  explained != "not explained"
)

pExplainted <- ggplot(
  sample_relative_wide_snpsUsed_final_long_woZeros,
  aes(x = sample, y = Median, group = site, color = explained, fill = explained)
) +
  geom_line(alpha = .5, size = 0.5) +
  theme_classic() +
  facet_wrap(~explained) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9)
  )

table(sample_relative_wide_snpsUsed_final_long_woZeros$explained)
pExplainted

# alternative PNG export:
# png("~/Desktop/Projects/2019_RMK202_analysis/plot/supp_altNucFre_lineages.png", width = 3400, height = 2600,res=300)
svg(
  "~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_explained_by_isolates.svg",
  width = 7,
  height = 4.5
)
pExplainted
dev.off()
###--------------------
## Subset for the slides: only four of the multi-lineage classes, drawn in one row
table(sample_relative_wide_snpsUsed_final_long_woZeros$explained)

# earlier selection, kept for reference:
# c("NA_NA_lin3_lin4","NA_lin2_lin3_lin4","NA_lin2_NA_lin4","lin3")
sample_relative_wide_snpsUsed_final_long_tmp <- filter(
  sample_relative_wide_snpsUsed_final_long_woZeros,
  explained %in% c("lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4", "lin1_NA_lin3_NA")
)

# panel order
sample_relative_wide_snpsUsed_final_long_tmp$explained <- factor(
  sample_relative_wide_snpsUsed_final_long_tmp$explained,
  levels = c("lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4", "lin1_NA_lin3_NA")
)

pExplainted <- ggplot(
  sample_relative_wide_snpsUsed_final_long_tmp,
  aes(x = sample, y = Median, group = site, color = explained, fill = explained)
) +
  geom_line(alpha = .5, size = 0.5) +
  theme_classic() +
  facet_wrap(~explained, ncol = 4) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 75, hjust = 1, size = 6)
  )

pExplainted

png(
  "~/Desktop/Projects/2019_RMK202_analysis/plot/supp_altNucFre_lineages_03.png",
  width = 3400,
  height = 1000,
  res = 300
)
pExplainted
dev.off()

###--------------------
# Relabel the single-lineage classes as "explained by isolates" and plot every class in its
# own panel. Labels that are not listed in the mapping keep their original value.

isolate_labels <- c(
  "lin1" = "explained by isolates",
  "lin2" = "explained by isolates",
  "lin3" = "explained by isolates",
  "lin4" = "explained by isolates",
  "multiple" = "explained by isolates",
  "not explained" = "not explained by isolates"
)
sample_relative_wide_snpsUsed_final_long$explained_02 <- revalue(
  sample_relative_wide_snpsUsed_final_long$explained,
  isolate_labels
)

pExplainted <- ggplot(
  sample_relative_wide_snpsUsed_final_long,
  aes(x = sample, y = Median, group = site, color = explained_02, fill = explained_02)
) +
  geom_line(size = 0.5, alpha = 0.5) +
  theme_classic() +
  facet_wrap(~explained_02) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "none"
  ) +
  labs(
    "",
    x = "",
    y = "Alternative allele frequency"
  )

# the figure is shown twice in the rendered document
pExplainted
pExplainted

# The file name carries a double dot, so it does not overwrite
# "Supplement_SNVs_explained_by_isolates.svg".
svg(
  "~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_explained_by_isolates..svg",
  width = 7,
  height = 4.5
)
pExplainted
dev.off()



##----------------------
# Explained vs. not explained: collapse all classes into two groups and plot them side by side
##----------------------

# everything except "not explained by isolates" counts as explained
sample_relative_wide_snpsUsed_final_long$explained_03 <- ifelse(
  sample_relative_wide_snpsUsed_final_long$explained_02 == "not explained by isolates",
  "not explained by isolates",
  "explained"
)

# Order of the samples on the x axis. "Reference 1" / "Reference 2" must be listed: the
# cheesemaking samples were renamed to these labels before the table was reshaped, and any
# value missing from `levels` is turned into NA by factor(). The same level set is used for
# the other sample factors in this script.
sample_relative_wide_snpsUsed_final_long$sample <- factor(
  sample_relative_wide_snpsUsed_final_long$sample,
  levels = c(
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014",
    "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "cheesemaking\nday1", "cheesemaking\nday2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E",
    "Reference 1", "Reference 2"
  )
)

pExplainted_rough <- ggplot(
  sample_relative_wide_snpsUsed_final_long,
  aes(x = sample, y = Median, group = site, color = explained_03, fill = explained_03)
) +
  geom_line(size = 0.5, alpha = 0.5) +
  theme_classic() +
  facet_wrap(~explained_03) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "none"
  ) +
  labs(
    "",
    x = "",
    y = "Alternative allele frequency"
  )
pExplainted_rough

svg(
  "~/Desktop/mid_thesis/report/figures/20200430/chapter1/Supplement_SNVs_notExplained.svg",
  width = 7,
  height = 4.5
)
pExplainted_rough
dev.off()

##----------------------
# Where on the genome do the SNVs of each class (incl. putative recombinants) lie?
##----------------------

# Selection restricted to the classes that are shared by some, but not all, lineages.
# Note that the statement right below replaces this object with the unrestricted table.
sample_relative_wide_snpsUsed_final_long_recombinants <- sample_relative_wide_snpsUsed_final_long %>%
  filter(explained %in% c(
    "lin1_lin2_NA_lin4", "lin1_NA_lin3_lin4", "lin1_NA_lin3_NA", "lin1_NA_NA_lin4",
    "NA_lin2_lin3_NA", "NA_lin2_NA_lin4", "NA_lin2_lin3_lin4"
  )) %>%
  select("site", "explained") %>%
  unique()

# one row per site and class, all classes
sample_relative_wide_snpsUsed_final_long_recombinants <- sample_relative_wide_snpsUsed_final_long %>%
  select("site", "explained") %>%
  unique()

# genomic position = second "_"-separated field of the site identifier
site_fields <- str_split_fixed(sample_relative_wide_snpsUsed_final_long_recombinants$site, "_", 4)
sample_relative_wide_snpsUsed_final_long_recombinants$location <- as.numeric(site_fields[, 2])

# panel order
sample_relative_wide_snpsUsed_final_long_recombinants$explained <- factor(
  sample_relative_wide_snpsUsed_final_long_recombinants$explained,
  levels = c(
    "lin1", "lin2", "lin3", "lin4", "not explained",
    "lin1_lin2_NA_NA", "NA_NA_lin3_lin4", "lin1_lin2_lin3_lin4",
    "NA_lin2_lin3_lin4", "lin1_lin2_NA_lin4", "lin1_NA_lin3_lin4",
    "lin1_NA_lin3_NA", "lin1_NA_NA_lin4", "NA_lin2_lin3_NA", "NA_lin2_NA_lin4"
  )
)

# panel titles: class label plus its interpretation
class_interpretation <- c(
  "lin1_lin2_NA_NA" = "lin1_lin2_NA_NA=~evolved_after_split",
  "NA_NA_lin3_lin4" = "NA_NA_lin3_lin4=~evolved_after_split",
  "lin1_lin2_lin3_lin4" = "lin1_lin2_lin3_lin4=polishingERRORs",
  "NA_lin2_lin3_lin4" = "NA_lin2_lin3_lin4=lost_in_lin1",
  "lin1_NA_lin3_lin4" = "lin1_NA_lin3_lin4=lost_in_lin2",
  "lin1_lin2_NA_lin4" = "lin1_lin2_NA_lin4=lost_in_lin3",
  "lin1_NA_lin3_NA" = "lin1_NA_lin3_NA=recombination",
  "lin1_NA_NA_lin4" = "lin1_NA_NA_lin4=recombination",
  "NA_lin2_lin3_NA" = "NA_lin2_lin3_NA=recombination",
  "NA_lin2_NA_lin4" = "NA_lin2_NA_lin4=recombination"
)
sample_relative_wide_snpsUsed_final_long_recombinants$explained_2 <- revalue(
  sample_relative_wide_snpsUsed_final_long_recombinants$explained,
  class_interpretation
)

# histogram of the positions, one panel per class with its own y scale
plocation <- ggplot(
  sample_relative_wide_snpsUsed_final_long_recombinants,
  aes(x = location, color = explained, fill = explained)
) +
  geom_histogram() +
  facet_wrap(~explained_2, scales = "free_y") +
  theme_classic() +
  labs(title = "location of SNVs coming from different ", x = "genomic location") +
  theme(legend.position = "none")

plocation

png("~/Desktop/supp_altNucFre_lineages_location.png", width = 3400, height = 2600, res = 300)
plocation
dev.off()

##----------------------
# Sites that are not explained by any lineage: frequency distribution per sample
##----------------------
table(sample_relative_wide_snpsUsed_final_long$explained)
sample_relative_wide_snpsUsed_final_long_notExplained <- filter(
  sample_relative_wide_snpsUsed_final_long,
  explained == "not explained"
)

ggplot(sample_relative_wide_snpsUsed_final_long_notExplained, aes(x = Median)) +
  geom_density() +
  facet_wrap(~sample) +
  theme_classic()


### Check for multiallelic sites: rows per position vs. number of distinct positions.
# NOTE(review): the division by 6 presumably stands for the number of samples per site;
# confirm it still matches the number of sample columns gathered into the long table.
sitesss <- str_split_fixed(sample_relative_wide_snpsUsed_final_long_notExplained$site, "_", 4)[, 2]
length(sitesss) / 6
length(unique(sitesss))

# same check on all sites
sitesss <- str_split_fixed(sample_relative_wide_snpsUsed_final_long$site, "_", 4)[, 2]
length(sitesss) / 6
length(unique(sitesss))

##----------------------
# meansss and medianss
##----------------------
library(patchwork)
# mediannssss <-  aggregate(.~sample+explained, data=sample_relative_wide_snpsUsed_final_long[,c("explained","sample","Median")], median, na.rm=TRUE)%>% filter(explained!="multiple")
# meanssss <-  aggregate(.~sample+explained, data=sample_relative_wide_snpsUsed_final_long[,c("explained","sample","Median")], median, na.rm=TRUE) %>% filter(explained!="multiple")
# 
# mediansplot <- ggplot(mediannssss,aes(x=sample,y=Median,color=explained,fill=explained))+geom_bar(stat="identity")+theme_classic()+theme(legend.position = "none")
# meansssplot <- ggplot(meanssss,aes(x=sample,y=Median,color=explained,fill=explained))+geom_bar(stat="identity")+theme_classic()
# 
# mediansplot+meansssplot

##---without zeros
# Per sample and `explained` class: median and mean allele frequency of the sites that
# passed the frequency cut-off. The value column keeps its name `Median` in both tables.
# table(sample_relative_wide_snpsUsed_final_long_woZeros$sample)
mediannssss <- aggregate(
  . ~ sample + explained,
  data = sample_relative_wide_snpsUsed_final_long_woZeros[, c("explained", "sample", "Median")],
  median,
  na.rm = TRUE
) %>%
  filter(explained != "multiple")

# The mean table was formerly aggregated with `median` as well (copy-paste slip), which
# made both panels of the comparison below identical.
meanssss <- aggregate(
  . ~ sample + explained,
  data = sample_relative_wide_snpsUsed_final_long_woZeros[, c("explained", "sample", "Median")],
  mean,
  na.rm = TRUE
) %>%
  filter(explained != "multiple")

mediansplot <- ggplot(mediannssss, aes(x = sample, y = Median, color = explained, fill = explained)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(legend.position = "none")
meansssplot <- ggplot(meanssss, aes(x = sample, y = Median, color = explained, fill = explained)) +
  geom_bar(stat = "identity") +
  theme_classic()

# medians (left) next to means (right); `+` on two plots is the patchwork composition
mediansplot + meansssplot

##---------------------------------
## Lineage abundance
## Relative abundance of the four lineages per sample, derived from the median frequencies
## in `mediannssss`:
##   Overall_ratio = median frequency of the sites shared by lin3 and lin4 ("NA_NA_lin3_lin4")
##   rel_lin2      = median frequency of the lin2-specific sites
##   rel_lin3      = median frequency of the lin3-specific sites
##---------------------------------
ratio_larger <- mediannssss %>% filter(explained=="NA_NA_lin3_lin4") %>% select("sample","Median")  %>% dplyr::rename(Overall_ratio = Median) 
ratio_larger_2 <- mediannssss %>% filter(explained=="lin2") %>% select("sample","Median")  %>% dplyr::rename(rel_lin2 = Median) 
ratio_larger_3 <- mediannssss %>% filter(explained=="lin3") %>% select("sample","Median")  %>% dplyr::rename(rel_lin3 = Median) 
# ratio_larger_4 <- mediannssss %>% filter(explained=="lin4") %>% select("sample","Median")  %>% dplyr::rename(rel_lin4 = Median) 

# left joins: every sample that has an Overall_ratio is kept; missing lineage medians become NA
ratio_together_1 <- merge(ratio_larger,ratio_larger_2,by="sample",all.x = TRUE)
ratio_together_2 <- merge(ratio_together_1,ratio_larger_3,by="sample",all.x = TRUE)
# (1 - Overall_ratio) is split between lin1 and lin2 using rel_lin2,
# Overall_ratio is split between lin3 and lin4 using rel_lin3
ratio_together_2$lin1 <- (1-ratio_together_2$Overall_ratio)*(1-ratio_together_2$rel_lin2)
ratio_together_2$lin2 <- (1-ratio_together_2$Overall_ratio)*(ratio_together_2$rel_lin2)
ratio_together_2$lin3 <- (ratio_together_2$Overall_ratio)*(ratio_together_2$rel_lin3)
ratio_together_2$lin4 <- (ratio_together_2$Overall_ratio)*(1-ratio_together_2$rel_lin3)


# NOTE(review): same filter as ratio_larger_3 (lin3, not lin4); the object is not used in the code shown here
ratio_larger_4 <- mediannssss %>% filter(explained=="lin3") %>% select("sample","Median")  %>% dplyr::rename(rel_lin3 = Median) 


## Ratios in the Reference samples
# There are no lin3- or lin4-specific hits, which is why it cannot be calculated like before.
# lin1 is the reference; everything that is not lin1 is lin2.

# REffs2 is reused: first for "Reference 1", then overwritten for "Reference 2"
REffs2 <- mediannssss %>% filter(sample=="Reference 1") %>% filter(explained=="lin2") %>% select(Median) %>% unlist() %>% as.numeric()
ratio_together_final_referenceSamples <- data.frame(sample="Reference 1",lin4=0,lin3=0,lin2=REffs2,lin1=1-REffs2)
REffs2 <- mediannssss %>% filter(sample=="Reference 2") %>% filter(explained=="lin2") %>% select(Median) %>% unlist() %>% as.numeric()
ratio_together_final_referenceSamples2 <- data.frame(sample="Reference 2",lin4=0,lin3=0,lin2=REffs2,lin1=1-REffs2)

# ratio_together_final_referenceSamples <- 

# ratio_together_2$Unknown <- 1-(ratio_together_2$lin1+ratio_together_2$lin2+ratio_together_2$lin3+ratio_together_2$lin4)

# ratio_together_final <- ratio_together_2 %>% select(sample,lin1,lin2,lin3,lin4) %>% gather(.,lineage,"Relative abundance","lin1","lin2","lin3","lin4")
ratio_together_final_tmp1 <- ratio_together_2 %>% select(sample,lin4,lin3,lin2,lin1) 

# stack all samples and reshape to long format: one row per sample and lineage
ratio_together_final <- rbind(ratio_together_final_tmp1,ratio_together_final_referenceSamples,ratio_together_final_referenceSamples2) %>% gather(.,lineage,"Relative abundance","lin4","lin3","lin2","lin1")

# Stacked bar chart of the relative lineage abundance per sample.
table(ratio_together_final$sample)

# order of the samples on the x axis
ratio_together_final$sample <- factor(
  ratio_together_final$sample,
  levels = c(
    "Lyo\n1996", "Lyo\n2012", "Lyo\n2014",
    "working\nstock",
    "starter\nculture\n2012", "starter\nculture\n2018",
    "cheesemaking\nday1", "cheesemaking\nday2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E",
    "Reference 1", "Reference 2"
  )
)

# stacking order of the lineages
ratio_together_final$lineage <- factor(
  ratio_together_final$lineage,
  levels = c("lin4", "lin3", "lin2", "lin1")
)

# one colour per lineage, in the order of the factor levels above
# previous palette: c("#0000FF","#6699FF","#99CCFF","#00FFFF")
colorsss <- c("#99CCFF", "#00FFFF", "#0000FF", "#6699FF")

plot_strain_abundance <- ggplot(
  ratio_together_final,
  aes(x = sample, y = `Relative abundance`, color = lineage, fill = lineage)
) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  labs(x = "") +
  scale_color_manual(values = colorsss) +
  scale_fill_manual(values = colorsss) +
  theme(legend.position = "none")
plot_strain_abundance

# alternative PNG export:
# png("~/Desktop/supp_strain_Abundance.png", width = 2800, height = 2200,res=300)
svg(
  "~/Desktop/Projects/2019_RMK202_analysis/plot/strain_rel_abundance.svg",
  width = 6,
  height = 3.5
)
plot_strain_abundance
dev.off()

1.4 Figure 4

Figure 4. Phenotypic properties of individual strains, pairwise combination of strains, and original starter culture. A) Colony forming units (CFUs) of S. thermophilus and L. delbrueckii over 18h of growth when cultured alone, in pairwise combinations, or in the original starter cultures (RMK). The ribbons illustrate the interquartile range and the lines the modeled growth curves. (* indicates t-test, p-value<0.001) B) Acidification curves of the same samples. The ribbons illustrate the min and max pH of the different samples. C) Principal component analysis of the metabolic profiles after 24 h of growth at 37 °C. Different treatments are highlighted in colors and with the surrounding ellipse.

Figure 4. Phenotypic properties of individual strains, pairwise combination of strains, and original starter culture. A) Colony forming units (CFUs) of S. thermophilus and L. delbrueckii over 18h of growth when cultured alone, in pairwise combinations, or in the original starter cultures (RMK). The ribbons illustrate the interquartile range and the lines the modeled growth curves. (* indicates t-test, p-value<0.001) B) Acidification curves of the same samples. The ribbons illustrate the min and max pH of the different samples. C) Principal component analysis of the metabolic profiles after 24 h of growth at 37 °C. Different treatments are highlighted in colors and with the surrounding ellipse.

1.4.1 Acidification

Here, I analyse the acidification and growth rates measured on 13.7.2020. I measured plate one, which was pipetted with the robot: first the pH 4.6 buffer, then milk, then the culture (total 175 µl).

###-----------------------------
## Variables: input files of the acidification run
###-----------------------------

location <- "../data_zenodo/non_genomic_data//acidifciation_20200714/"
plateName <- "20200712_rmk202_strains_curated_02.txt"
NamesWell <- "200714_names.csv"
SampleNamesWell <- "200714_samples_names.csv"

sampleName <- "20200712_rmk202_strains"
# replicate=03
# plateNumber=1

# Time marks (in seconds) that separate the calibration phases from the measurement;
# they are drawn as vertical lines in the calibration plot.
CALIBRATION_PH4_5 <- 6000        # until which second the pH 4.5 calibration lasts
CALIBRATION_start_PH6_5 <- 13000 # from which second the pH 6.5 calibration window starts
CALIBRATION_PH6_5 <- 16000       # until which second the pH 6.5 calibration lasts
MEASURMENT_START <- 14000        # second marking the start of the measurement

###-----------------------------
## Contaminated wells
## This has to be added after the first run: check whether large outliers or fermented
## blanks are occurring.
###-----------------------------
exclude <- c("B5", "A12", "A8", "A4")

###-----------------------------
## Import the plate-reader export (tab separated); the column "X99" is dropped
###-----------------------------
plate_file <- paste0(location, plateName)
X20200616_test_hydro_final_02 <- read_delim(
  plate_file,
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
) %>%
  dplyr::select(-"X99")
colnames(X20200616_test_hydro_final_02)

###-----------------------------
## Prepare the time stamp
# Unfortunately the Molecular Devices machine gives a weird time format (after 24 h it adds
# "1." in front of the hour column). In order to correct it, the previous bash chunk has to
# be run as well.
###-----------------------------

# "<days>.<hh:mm:ss>" -> days | clock time
day_and_clock <- str_split_fixed(X20200616_test_hydro_final_02$`0.Time`, fixed("."), 2)
X20200616_test_hydro_final_02$days <- as.numeric(day_and_clock[, 1])
X20200616_test_hydro_final_02$time <- day_and_clock[, 2]

# "hh:mm:ss" -> hours | minutes (the seconds are not used)
clock_fields <- str_split_fixed(X20200616_test_hydro_final_02$time, fixed(":"), 3)
X20200616_test_hydro_final_02$hourOld <- as.numeric(clock_fields[, 1])
X20200616_test_hydro_final_02$min <- as.numeric(clock_fields[, 2])

# elapsed time as one duration; the helper and raw time columns are removed afterwards
X20200616_test_hydro_final_03 <- X20200616_test_hydro_final_02 %>%
  mutate(
    days = duration(days, "day"),
    hourOld = duration(hourOld, "hour"),
    min = duration(min, "minute"),
    TIMEfinal = hourOld + days + min
  ) %>%
  dplyr::select(-c("days", "time", "hourOld", `0.Time`, `Temperature(¡C)`, "min"))

hydroplate_wide_prep <- X20200616_test_hydro_final_03

# Signal of well A1 over time, with the calibration/measurement marks as vertical lines;
# used to check the time marks defined in the variables section.
plotForCalibration <- ggplot(hydroplate_wide_prep, aes(x = TIMEfinal, y = A1)) +
  geom_point() +
  theme_classic() +
  geom_vline(xintercept = c(CALIBRATION_PH4_5, CALIBRATION_PH6_5, MEASURMENT_START, CALIBRATION_start_PH6_5))
plotForCalibration

# interactive version of the same plot
ggp <- ggplotly(plotForCalibration)
ggp

###-----------------------------
## pH calibration
###-----------------------------

# Mean raw signal per well inside the high-pH calibration window
# (CALIBRATION_start_PH6_5 <= t <= CALIBRATION_PH6_5).
hydroplate_wide_ph_6.4 <- X20200616_test_hydro_final_03 %>%
  filter(
    TIMEfinal >= (paste0(CALIBRATION_start_PH6_5, "s")) &
      TIMEfinal <= (paste0(CALIBRATION_PH6_5, "s"))
  ) %>%
  dplyr::select(-"TIMEfinal") %>%
  colMeans(na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column(var = "well")
names(hydroplate_wide_ph_6.4)[2] <- "pH_6.4"

# Mean raw signal per well inside the low-pH calibration window
# (t < CALIBRATION_PH4_5).
hydroplate_wide_ph_4_66 <- X20200616_test_hydro_final_03 %>%
  filter(TIMEfinal < (paste0(CALIBRATION_PH4_5, "s"))) %>%
  dplyr::select(-"TIMEfinal") %>%
  colMeans(na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column(var = "well")
names(hydroplate_wide_ph_4_66)[2] <- "pH_4.66"

# One row per well holding both calibration signals.
hydroplate_wide_ph_calibration <- merge(
  hydroplate_wide_ph_6.4,
  hydroplate_wide_ph_4_66,
  by = "well"
)
hydroplate_wide_ph_calibration_curve <- hydroplate_wide_ph_calibration

###--------------- calculate pH regression
# Two-point linear calibration per well: pH = slope * signal + intersect.
# NOTE(review): the columns are named pH_6.4 / pH_4.66 but the regression uses
# 6.5 and 4.6 as the reference pH values - confirm which buffers were used.
ph_high <- as.numeric(6.5)
ph_low <- as.numeric(4.6)

# as.integer() truncates the mean signals to whole numbers before they are used.
signal_high <- as.integer(hydroplate_wide_ph_calibration_curve$pH_6.4)
signal_low <- as.integer(hydroplate_wide_ph_calibration_curve$pH_4.66)

hydroplate_wide_ph_calibration_curve$slope <- ((ph_high - ph_low) / (signal_high - signal_low))
hydroplate_wide_ph_calibration_curve$intersect <-
  ph_high - (hydroplate_wide_ph_calibration_curve$slope * signal_high)

# Only the per-well line parameters are needed downstream.
hydroplate_calbration_test <- hydroplate_wide_ph_calibration_curve %>%
  dplyr::select(well, slope, intersect)

###-----------------------------
## merge samples
###-----------------------------

# Long format: one row per time point and well. The first 96 columns are the
# wells; their raw signal is stored in the column "pH" until it is calibrated.
hydroplate_long <- gather(
  hydroplate_wide_prep,
  sample,
  pH,
  colnames(hydroplate_wide_prep)[1:96],
  factor_key = TRUE,
  na.rm = TRUE
)

# Attach each well's calibration line and convert the raw signal to pH.
hydroplate_wide_ph_calibration_02 <- merge(
  hydroplate_long,
  hydroplate_calbration_test,
  by.x = "sample",
  by.y = "well"
)
hydroplate_wide_ph_calibration_02[["intensity_calibrated"]] <- with(
  hydroplate_wide_ph_calibration_02,
  slope * pH + intersect
)

# No time window is cut out at this point; the full series is carried on.
hydroplate_wide_ph_calibration_03 <- hydroplate_wide_ph_calibration_02

# Quick look at all calibrated curves, coloured by well.
ggplot(
  hydroplate_wide_ph_calibration_03,
  aes(x = TIMEfinal, y = intensity_calibrated, group = sample, fill = sample, color = sample)
) +
  geom_point() +
  theme_classic()

###-----------------------------
## add names
###-----------------------------

# Two tab-separated annotation tables, joined on "SAMPLE" keeping rows that
# occur in only one of them.
hydroplates_names <- read_delim(
  paste0(location, NamesWell),
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
hydroplates_names_samples <- read_delim(
  paste0(location, SampleNamesWell),
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
hydroplates_names_final <- merge(
  hydroplates_names,
  hydroplates_names_samples,
  by = "SAMPLE",
  all = TRUE
)

# Annotate every measurement via its well and drop the blank wells.
hydroplate_wide_ph_calibration_04 <- merge(
  hydroplate_wide_ph_calibration_03,
  hydroplates_names_final,
  by.x = "sample",
  by.y = "well"
) %>%
  filter(SAMPLE != "blank")
table(hydroplate_wide_ph_calibration_04$SAMPLE)
table(hydroplate_wide_ph_calibration_04$sample)

###-----------------------------
## exclude if necessary
###-----------------------------

# Remove the wells listed in `exclude`.
hydroplate_wide_ph_calibration_05 <- hydroplate_wide_ph_calibration_04 %>%
  filter(!(sample %in% exclude))

# Interactive overview, one panel per SAMPLE, saved next to the input data.
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_05,
  aes(x = TIMEfinal, y = intensity_calibrated, group = sample, fill = SAMPLE, color = SAMPLE)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(~SAMPLE) +
  labs(y = "pH", x = "") +
  geom_vline(xintercept = c(CALIBRATION_PH4_5, CALIBRATION_PH6_5, MEASURMENT_START))
ggp <- ggplotly(htmlPrep)
ggp
htmlwidgets::saveWidget(ggp, paste0(location, sampleName, "_cleaned.html"))

# Keep only readings after the start of the pH 6.5 calibration window.
hydroplate_wide_ph_calibration_05 <- hydroplate_wide_ph_calibration_05 %>%
  filter(TIMEfinal > (paste0(CALIBRATION_start_PH6_5, "s")))

##-----------------------------------------------------------
# modelling
##-----------------------------------------------------------
Sys.sleep(10) # pause kept from the original workflow; its purpose is not evident here

# Start values for the Baranyi fit of the acidification curves.
p <- c(y0 = 6.608016, mumax = 0.0003549404, K = 4.703572, h0 = 11.044679)

# Plain-seconds copy of the time axis (the fit below still uses TIMEfinal).
hydroplate_wide_ph_calibration_05$TIME <- as.numeric(hydroplate_wide_ph_calibration_05$TIMEfinal)

# One Baranyi fit per well; NAME and grouping are carried along as labels.
many_baranyi_sub <- all_growthmodels(
  intensity_calibrated ~ grow_baranyi(TIMEfinal, parms) | sample + NAME + grouping,
  data = hydroplate_wide_ph_calibration_05, p = p, ncores = 8
)

# Overview of all fits on one device (12 x 8 panels with small margins).
par(mfrow = c(12, 8))
par(mar = c(1, 1, 1, 1))
plot(many_baranyi_sub)

# Fitted parameters per well. The lag phase follows from h0 = mumax * lag.
# The earlier h0 / log(2) value was overwritten before it was ever used and
# is no longer computed.
many_baranyi2_res <- results(many_baranyi_sub)
many_baranyi2_res$lagPhase <- many_baranyi2_res$h0 / many_baranyi2_res$mumax
many_baranyi2_res

# Copy for plotting/export with the sign of mumax flipped, plotted below as
# "maximum pH decrease". lagPhase above is computed before the flip.
many_baranyi2_res_preped <- many_baranyi2_res
many_baranyi2_res_preped$mumax <- -many_baranyi2_res_preped$mumax
summary(many_baranyi2_res_preped)
table(many_baranyi2_res_preped$NAME)

# Fixed panel order. Any grouping value that is not listed becomes NA.
# NOTE(review): the offset step later revalues a grouping "starterCulture",
# which is not among these levels - confirm whether it should be added.
many_baranyi2_res_preped$grouping <- factor(
  many_baranyi2_res_preped$grouping,
  levels = c("lactobacillus", "streptococcus", "pairwise", "complex")
)

mumax_plot <- ggplot(many_baranyi2_res_preped, aes(x = NAME, y = mumax)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "maximum pH decrease") +
  facet_wrap(~grouping, ncol = 5, scales = "free_x")
k0plot <- ggplot(many_baranyi2_res_preped, aes(x = NAME, y = K)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "lowest pH") +
  facet_wrap(~grouping, ncol = 5, scales = "free_x")


###-------------------------------------------------
## plot
###-------------------------------------------------

# All Baranyi fits on one PNG (12 x 8 panels, small margins).
png(
  paste0(location, sampleName, "_all_baranyi.png"),
  width = 4000,
  height = 2000,
  res = 300
)
par(mfrow = c(12, 8), mar = c(1, 1, 1, 1))
plot(many_baranyi_sub)
dev.off()

# Calibrated pH curves, one panel per NAME, legend hidden.
finalSamplesPlot <- ggplot(
  hydroplate_wide_ph_calibration_05,
  aes(x = TIMEfinal, y = intensity_calibrated, group = NAME, fill = NAME, color = NAME)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(~NAME) +
  labs(y = "pH", x = "") +
  theme(legend.position = "none")

svg(paste0(location, sampleName, "_all_samples_pH.svg"), width = 12, height = 8)
finalSamplesPlot
dev.off()

# Box plots of the fitted mumax and K stacked in two rows: once written to
# file, once shown in the report.
svg(paste0(location, sampleName, "_all_boxplots_summary.svg"), width = 4, height = 4)
mumax_plot + k0plot + plot_layout(nrow = 2)
dev.off()

mumax_plot + k0plot + plot_layout(nrow = 2)

###-------------------------------------------------
## output data
###-------------------------------------------------

# Tag the fitted parameters with the plate and replicate identifiers (stored
# as text) and write them as a tab-separated table next to the input data.
many_baranyi2_res_preped[["plate"]] <- as.character(plateNumber)
many_baranyi2_res_preped[["replicate"]] <- as.character(replicate)

write.table(
  many_baranyi2_res_preped,
  file = paste0(location, sampleName, "_model.txt"),
  sep = "\t",
  quote = FALSE,
  na = "",
  row.names = FALSE,
  col.names = TRUE
)


###-------------------------------------------------
## lag phase phased
## h0: parameter specifying the initial physiological state of organisms
## (e.g. cells) and in consequence the lag phase
## (h0 = max growth rate * lag phase).
###-------------------------------------------------
str(many_baranyi2_res)
library(tidyverse)
many_baranyi2_res$sample

# Fitted lag phase per well.
preped <- many_baranyi2_res %>%
  dplyr::select(c("sample", "lagPhase")) %>%
  remove_rownames()

# Attach the lag phase to every reading of the corresponding well and keep it
# as a duration in seconds as well.
hydroplate_wide_ph_calibration_06 <- merge(
  hydroplate_wide_ph_calibration_05,
  preped,
  by = "sample"
)
hydroplate_wide_ph_calibration_06 <- hydroplate_wide_ph_calibration_06 %>%
  mutate(lagSeconds = duration(lagPhase, 'second'))

# "Phased" time: elapsed time minus the well's own lag phase. For the
# lactobacillus grouping only 75% of the lag is subtracted (rationale not
# stated here). The unconditional TIMEfinal - lagSeconds assignment that
# preceded this line was overwritten immediately and has been dropped.
# Downstream code treats curatedTime as plain seconds.
hydroplate_wide_ph_calibration_06$curatedTime <- ifelse(
  hydroplate_wide_ph_calibration_06$grouping == "lactobacillus",
  hydroplate_wide_ph_calibration_06$TIMEfinal - (0.75 * hydroplate_wide_ph_calibration_06$lagSeconds),
  hydroplate_wide_ph_calibration_06$TIMEfinal - hydroplate_wide_ph_calibration_06$lagSeconds
)

# All phased curves in one panel.
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_06,
  aes(x = curatedTime, y = intensity_calibrated, group = sample, fill = grouping, color = grouping)
) +
  geom_point(alpha = 0.1) +
  theme_classic()
ggp <- ggplotly(htmlPrep)
ggp

# The same curves split into one panel per grouping.
htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_06,
  aes(x = curatedTime, y = intensity_calibrated, group = sample, fill = grouping, color = grouping)
) +
  geom_point() +
  theme_classic() +
  facet_wrap(. ~ grouping, ncol = 4)
ggp <- ggplotly(htmlPrep)
ggp

# Thinned copy (every 20th row) for a lighter interactive plot.
hydroplate_wide_ph_calibration_06_sub <- hydroplate_wide_ph_calibration_06 %>%
  filter(row_number() %% 20 == 1)

htmlPrep <- ggplot(
  hydroplate_wide_ph_calibration_06_sub,
  aes(x = curatedTime, y = intensity_calibrated, group = sample, fill = grouping, color = grouping)
) +
  geom_point(alpha = 0.1) +
  theme_classic()
ggp <- ggplotly(htmlPrep)
ggp

###-------------------------------------
## make a subsetting and averaging
###-------------------------------------

# For every grouping: sort all readings by phased time and summarise the
# calibrated pH in rolling windows of 10 readings that advance by 5 readings.
# Each window is labelled with the phased time of its first reading.
# Changes from the original loop: tibble() replaces the deprecated
# data_frame(), the window results no longer shadow base min/mean/max, the
# pieces are bound once instead of growing with rbind(), and groups with fewer
# than 10 readings yield zero rows instead of a length mismatch.
rolling_ph_windows <- lapply(
  as.character(unique(hydroplate_wide_ph_calibration_06$grouping)),
  function(groupsss) {
    readings <- hydroplate_wide_ph_calibration_06 %>%
      filter(grouping == groupsss) %>%
      arrange(curatedTime)

    ph_values <- readings$intensity_calibrated
    window_min <- rollapply(ph_values, width = 10, by = 5, FUN = min, align = "left")
    window_mean <- rollapply(ph_values, width = 10, by = 5, FUN = mean, align = "left")
    window_max <- rollapply(ph_values, width = 10, by = 5, FUN = max, align = "left")

    # Windows start at rows 1, 6, 11, ...; keep as many start times as there
    # are complete windows.
    is_window_start <- seq_len(nrow(readings)) %% 5 == 1
    window_start <- readings$curatedTime[is_window_start][seq_along(window_max)]

    tibble(
      time = window_start,
      grouping = groupsss,
      minum = window_min,
      maxum = window_max,
      median = window_mean # column name kept; the value is the rolling mean
    )
  }
)
final_subsetting <- bind_rows(rolling_ph_windows)

table(final_subsetting$grouping)

# Restrict to phased times below 64800 s and express time in hours.
final_subsetting_subset <- final_subsetting %>%
  filter(time < 64800) %>%
  mutate(timeFinal = time / 3600)

# The "complex" grouping is not shown in the figure.
final_subsetting_subset_final <- final_subsetting_subset %>%
  filter(grouping != "complex")

# Earliest phased time (hours); reused when the growth curves are aligned.
minTime <- min(final_subsetting_subset_final$timeFinal)

colorsss <- c("#0081a7ff", "#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#4e3d42ff")
colorss_02 <- c("#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#0081a7ff")

# Ribbon from the rolling minimum to the rolling maximum pH per grouping.
ribbon_theme <- theme(
  rect = element_rect(fill = "transparent"), # all rectangles
  legend.position = "bottom",
  legend.title = element_blank(),
  axis.text.y = element_text(size = 8),
  plot.margin = unit(c(t = 0, r = 0.5, b = 0, l = 0.1), "cm")
)

low_phased_ribbon <- ggplot() +
  geom_ribbon(
    aes(x = timeFinal, ymin = minum, ymax = maxum, group = grouping, fill = grouping),
    data = final_subsetting_subset_final,
    alpha = .4
  ) +
  labs(y = "pH", x = "phased incubation time") +
  scale_fill_manual(values = colorss_02) +
  scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c(0, 6, 12, 18)) +
  theme_classic() +
  ribbon_theme

low_phased_ribbon

# Same figure written to file.
svg("03_results/acidifaction_curve.svg", width = 6, height = 4)
low_phased_ribbon
dev.off()
 

##------------------
## look at offset
##------------------
colnames(hydroplate_wide_ph_calibration_06)

# One row per distinct (grouping, lag phase) pair, i.e. the fitted lag values
# that occur within each grouping.
lagsss <- hydroplate_wide_ph_calibration_06 %>%
  dplyr::select(c("grouping", "lagPhase")) %>%
  remove_rownames() %>%
  unique()

# Median lag per grouping (despite the "_mean" name), in seconds and in hours,
# plus a variant shifted by two hours.
lagsss_mean <- aggregate(lagPhase ~ grouping, data = lagsss, FUN = median, na.rm = TRUE)
lagsss_mean[["hours"]] <- lagsss_mean[["lagPhase"]] / 3600
lagsss_mean[["hours_corrected"]] <- lagsss_mean[["hours"]] - 2

1.4.2 Growth data

CFU count of the same time series

# CFU counts; the first line of the file is skipped.
rmk202_strain_timeseries_Sheet1 <- read_csv(
  "../data_zenodo/non_genomic_data/rmk202_growth_strain_timeseries.csv",
  skip = 1
)

# Long format: the columns `0_BM` ... `24_MR` are named "<time>_<plate>".
rmk202_strain_timeseries_Sheet1_long <- rmk202_strain_timeseries_Sheet1 %>%
  gather(., "sampless", "CFU", `0_BM`:`24_MR`)

time_and_plate <- str_split_fixed(rmk202_strain_timeseries_Sheet1_long$sampless, "_", 2)
rmk202_strain_timeseries_Sheet1_long$time <- as.numeric(time_and_plate[, 1])
rmk202_strain_timeseries_Sheet1_long$plate <- time_and_plate[, 2]

table(rmk202_strain_timeseries_Sheet1_long$plate)

# Plate code -> what was counted on it.
rmk202_strain_timeseries_Sheet1_long$species <- plyr::revalue(
  rmk202_strain_timeseries_Sheet1_long$plate,
  c("BM" = "total", "M17X" = "S. thermophilus", "MR" = "L. delbrueckii")
)

# Running row number used as measurement ID. The exclusion list further down
# refers to these numbers, so the row order produced above must not change.
rmk202_strain_timeseries_Sheet1_long$samplecount <- 1:nrow(rmk202_strain_timeseries_Sheet1_long)

# One line per sample; the hover text exposes the measurement ID.
plot_overtime <- ggplot(
  rmk202_strain_timeseries_Sheet1_long,
  aes(
    x = time, y = CFU, color = species, fill = species, group = sample,
    text = paste("sampleID=", samplecount)
  )
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~ species + group, ncol = 5)

library(plotly)
ggp <- ggplotly(plot_overtime)
ggp


##----------------------
## exclude
# not grown
##----------------------

# Measurement IDs (samplecount) that are removed.
excludes <- c("228", "276", "277", "282", "286", "288")

rmk202_strain_timeseries_Sheet1_long_cleaned <- rmk202_strain_timeseries_Sheet1_long %>%
  filter(!(samplecount %in% excludes))

# Same overview as before on the cleaned data, now with free panel scales.
plot_overtime <- ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned,
  aes(
    x = time, y = CFU, color = species, fill = species, group = sample,
    text = paste("sampleID=", samplecount)
  )
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~ species + group, ncol = 5, scales = "free")

ggp <- ggplotly(plot_overtime)
ggp

# Box-plot view on a log2-transformed y axis.
ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned,
  aes(
    x = time, y = CFU, color = species, fill = species, group = sample,
    text = paste("sampleID=", samplecount)
  )
) +
  geom_boxplot() +
  theme_classic() +
  facet_wrap(~ species + group, ncol = 5) +
  coord_trans(y = "log2")

##----------------------
## mean, max, min
##----------------------

# Minimum, maximum and mean CFU for every group x time x species combination.
# Changes from the original loop: the CFU column is extracted with dplyr::pull
# (the unqualified select() was inconsistent with the dplyr::select used in the
# rest of the file), combinations without any count are skipped instead of
# producing -Inf/Inf/NaN rows that were filtered out afterwards, and the rows
# are bound once instead of growing a data frame with rbind().
cfu_summary_rows <- list()
for (typess in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$group)) {
  for (timesss in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$time)) {
    for (speccc in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$species)) {
      cfu_values <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
        filter(group == typess, time == timesss, species == speccc) %>%
        dplyr::pull(CFU)

      # Nothing measured for this combination.
      if (all(is.na(cfu_values))) {
        next
      }

      cfu_summary_rows[[length(cfu_summary_rows) + 1L]] <- data.frame(
        sample = typess,
        species = speccc,
        time = timesss,
        min = min(cfu_values, na.rm = TRUE),
        max = max(cfu_values, na.rm = TRUE),
        mean = mean(cfu_values, na.rm = TRUE)
      )
    }
  }
}
group_growth_cfu <- dplyr::bind_rows(cfu_summary_rows)

group_growth_cfu$time <- as.numeric(group_growth_cfu$time)

# Safety net: drop rows without a defined mean.
group_growth_cfu <- group_growth_cfu %>% filter(!is.nan(mean))

# Mean CFU over time, one panel per species, one line per group.
plot_overtime <- ggplot(
  group_growth_cfu,
  aes(x = time, y = mean, color = sample, fill = sample, group = interaction(sample, species))
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~species, nrow = 3)
plot_overtime

##----------------------
## polish
##----------------------

# Species-specific counts only, without the "all_strains" group.
group_growth_cfu_mean_species <- group_growth_cfu %>%
  filter(species != "total", sample != "all_strains")

# Mean CFU per group over time, one panel per species.
plot_overtime <- ggplot(
  group_growth_cfu_mean_species,
  aes(x = time, y = mean, color = sample, fill = sample, group = interaction(sample, species))
) +
  geom_line() +
  theme_classic() +
  facet_wrap(~species, nrow = 2)
plot_overtime

# Same means with a ribbon spanning the minimum to the maximum count.
plot_overtime_RIBBON <- ggplot() +
  geom_ribbon(
    aes(x = time, ymin = min, ymax = max, group = interaction(sample, species), fill = sample),
    data = group_growth_cfu_mean_species,
    alpha = .1
  ) +
  theme_classic() +
  facet_wrap(~species, nrow = 2) +
  geom_line(
    aes(x = time, y = mean, color = sample, fill = sample, group = interaction(sample, species)),
    data = group_growth_cfu_mean_species
  )
plot_overtime_RIBBON


##----------------------
## polish
##----------------------

# Raw counts for box plots: species-specific plates only, no "all_strains".
rmk202_strain_timeseries_Sheet1_long_cleaned_box <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
  filter(species != "total", group != "all_strains")

# Time as a discrete axis, species in a fixed order.
rmk202_strain_timeseries_Sheet1_long_cleaned_box$time <-
  as.factor(rmk202_strain_timeseries_Sheet1_long_cleaned_box$time)
rmk202_strain_timeseries_Sheet1_long_cleaned_box$species <- factor(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$species,
  levels = c("S. thermophilus", "L. delbrueckii")
)

plot_overtime_box <- ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box,
  aes(x = time, y = CFU, fill = group)
) +
  geom_boxplot(alpha = .3, color = "grey") +
  theme_classic() +
  facet_wrap(~species, nrow = 2)
plot_overtime_box

# Combined "<species>_<group>" label; printed once as found, then put in a
# fixed order.
rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal <- as.factor(paste0(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$species,
  "_",
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$group
))
levels(rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal)
rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal <- factor(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box$groupingFinal,
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto",
    "S. thermophilus_lacto", "L. delbrueckii_rmk", "L. delbrueckii_pairwise",
    "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)

plot_overtime_box <- ggplot(
  rmk202_strain_timeseries_Sheet1_long_cleaned_box,
  aes(x = time, y = CFU, fill = groupingFinal)
) +
  geom_boxplot(alpha = .3, color = "grey") +
  theme_classic()
plot_overtime_box


##----------------------
## model the data with Baranyi
##----------------------

Sys.sleep(5) # pause kept from the original workflow; its purpose is not evident here

# Start values for the Baranyi fit of the CFU counts.
p <- c(y0 = 500000, mumax = 1, K = 800000000, h0 = 8.044679)

# Species-specific counts without "all_strains" and without missing values,
# labelled "<species>_<group>".
rmk202_strain_timeseries_Sheet1_long_cleaned_model <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
  filter(species != "total") %>%
  filter(group != "all_strains") %>%
  filter(!is.na(CFU))
rmk202_strain_timeseries_Sheet1_long_cleaned_model$groupingFinal <- as.factor(paste0(
  rmk202_strain_timeseries_Sheet1_long_cleaned_model$species,
  "_",
  rmk202_strain_timeseries_Sheet1_long_cleaned_model$group
))

##---------- model1
# One Baranyi fit per species/group combination.
many_baranyi_sub <- all_growthmodels(
  CFU ~ grow_baranyi(time, parms) | groupingFinal,
  data = rmk202_strain_timeseries_Sheet1_long_cleaned_model, p = p, ncores = 8
)

results(many_baranyi_sub)

par(mfrow = c(12, 8))
par(mar = c(1, 1, 1, 1))
plot(many_baranyi_sub)

many_baranyi2_res <- results(many_baranyi_sub)
many_baranyi2_res

# These are growth fits, so mumax keeps its sign: the sign flip and the pH
# axis labels of the acidification section do not apply to CFU data. The
# grouping column of this fit is `groupingFinal` (there is no `name` column).
many_baranyi2_res_preped <- many_baranyi2_res
summary(many_baranyi2_res_preped)
table(many_baranyi2_res_preped$groupingFinal)

mumax_plot <- ggplot(many_baranyi2_res_preped, aes(x = groupingFinal, y = mumax)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "maximum growth rate (mumax)")
k0plot <- ggplot(many_baranyi2_res_preped, aes(x = groupingFinal, y = K)) +
  geom_boxplot() +
  theme_classic() +
  labs(x = "", y = "carrying capacity K (CFU)")

mumax_plot + k0plot + plot_layout(nrow = 2)


###---------------------
# boxplot
###---------------------
library(growthcurver)

# First variant: grey/red/orange fills with red/orange outlines.
fill_colers <- c("grey77", "grey55", "red", "grey88", "grey55", "orange")
colorr_colers <- c("red", "red", "red", "orange", "orange", "orange")

plot_overtime_box <- ggplot() +
  geom_boxplot(
    data = rmk202_strain_timeseries_Sheet1_long_cleaned_box,
    aes(x = time, y = CFU, fill = groupingFinal, color = groupingFinal),
    alpha = .2
  ) +
  scale_fill_manual(values = fill_colers) +
  scale_color_manual(values = colorr_colers) +
  theme_classic() +
  theme(legend.position = "bottom")
plot_overtime_box

# Second variant: fill_colers is unchanged, colorr_colers becomes two greys.
# Here the grey vector is used for the fill and fill_colers for the outline,
# with one panel per time point and species.
colorr_colers <- c("grey77", "grey77", "grey77", "grey55", "grey55", "grey55")

plot_overtime_box <- ggplot() +
  geom_boxplot(
    data = rmk202_strain_timeseries_Sheet1_long_cleaned_box,
    aes(x = time, y = CFU, fill = groupingFinal, color = groupingFinal),
    alpha = .3
  ) +
  scale_fill_manual(values = colorr_colers) +
  scale_color_manual(values = fill_colers) +
  theme_classic() +
  theme(legend.position = "bottom") +
  facet_wrap(~ time + species, ncol = 14, scales = "free_x")
plot_overtime_box

rmk202_strain_timeseries_Sheet1_long_cleaned_box$time


##----------------------
## ribbon of interquartile range
##----------------------

# 25%, 50% and 75% quantile of the CFU counts for every group x time x species
# combination. Combinations without counts yield NA and are removed further
# down. Changes from the original loop: the CFU column is pulled out as a
# vector before quantile() is called (instead of handing it a one-column data
# frame), and the rows are bound once instead of growing with rbind().
cfu_quartile_rows <- list()
for (typess in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$group)) {
  for (timesss in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$time)) {
    for (speccc in unique(rmk202_strain_timeseries_Sheet1_long_cleaned$species)) {
      cfu_quartiles <- rmk202_strain_timeseries_Sheet1_long_cleaned %>%
        filter(group == typess, time == timesss, species == speccc) %>%
        dplyr::pull(CFU) %>%
        quantile(na.rm = TRUE)

      # quantile() returns 0%, 25%, 50%, 75%, 100% in this order.
      cfu_quartile_rows[[length(cfu_quartile_rows) + 1L]] <- data.frame(
        sample = typess,
        species = speccc,
        time = timesss,
        twentyfifths = cfu_quartiles[[2]],
        seventhts = cfu_quartiles[[4]],
        mean = cfu_quartiles[[3]] # column name kept; the value is the median
      )
    }
  }
}
group_growth_cfu_quantiles <- dplyr::bind_rows(cfu_quartile_rows)

# Species-specific counts only, without "all_strains" and without empty
# combinations.
group_growth_cfu_quantiles_spec <- group_growth_cfu_quantiles %>%
  filter(species != "total") %>%
  filter(sample != "all_strains") %>%
  filter(!is.na(mean))

# Combined "<species>_<group>" label in a fixed order.
group_growth_cfu_quantiles_spec$groupingFinal <- as.factor(paste0(
  group_growth_cfu_quantiles_spec$species,
  "_",
  group_growth_cfu_quantiles_spec$sample
))
group_growth_cfu_quantiles_spec$groupingFinal <- factor(
  group_growth_cfu_quantiles_spec$groupingFinal,
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto",
    "S. thermophilus_lacto", "L. delbrueckii_rmk", "L. delbrueckii_pairwise",
    "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)

# Final palette; an earlier palette assigned right before it was never used.
fill_colers <- c("#FC766AFF", "#783937FF", "#F1AC88FF", "#339E66FF", "#078282FF", "#95DBE5FF")

# Upper end of the y axis: the largest 75% quantile.
maxValue <- max(group_growth_cfu_quantiles_spec$seventhts)

# Interquartile ribbon per species/group over time.
plot_overtime_RIBBON <- ggplot() +
  geom_ribbon(
    aes(
      x = time, ymin = twentyfifths, ymax = seventhts,
      group = interaction(sample, species), fill = groupingFinal
    ),
    data = group_growth_cfu_quantiles_spec,
    alpha = .2
  ) +
  theme_classic() +
  lims(x = c(0, 24), y = c(0, maxValue)) +
  scale_fill_manual(values = fill_colers) +
  scale_color_manual(values = fill_colers) +
  scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c(0, 6, 12, 18, 24))
plot_overtime_RIBBON


### models -------------- quartile
group_growth_cfu_quantiles_spec$twentyfifths

# Despite its name, df.predicted_model_upper collects BOTH fitted curves
# (pred.wt_upper and pred.wt_lower); df.predicted_model_lower stays empty.
df.predicted_model_upper <- data.frame()
df.predicted_model_lower <- data.frame()

for (samplezzz in unique(group_growth_cfu_quantiles_spec$groupingFinal)) {
  quartiles_of_group <- group_growth_cfu_quantiles_spec %>%
    filter(groupingFinal == samplezzz)

  # growthcurver fits: one through the 75% quantiles, one through the 25%
  # quantiles of this species/group combination.
  model.wt <- SummarizeGrowth(quartiles_of_group$time, quartiles_of_group$seventhts)
  model.wt_lower <- SummarizeGrowth(quartiles_of_group$time, quartiles_of_group$twentyfifths)

  # Evaluate both fits on 50 evenly spaced time points between 0 and 24.
  tt <- seq(0, 24, length = 50)
  predicted_band <- data.frame(
    group = samplezzz,
    time = tt,
    pred.wt_upper = predict(model.wt$model, newdata = list(t = tt)),
    pred.wt_lower = predict(model.wt_lower$model, newdata = list(t = tt))
  )

  df.predicted_model_upper <- rbind(df.predicted_model_upper, predicted_band)
}

# Modelled interquartile band on a log10 and on a linear y axis.
band_layer <- geom_ribbon(
  aes(x = time, ymin = pred.wt_lower, ymax = pred.wt_upper, group = group, fill = group),
  data = df.predicted_model_upper,
  alpha = .2
)

logCFU <- ggplot() +
  band_layer +
  theme_classic() +
  scale_y_continuous(trans = 'log10') +
  labs(y = "CFU/ml")

nonLOG <- ggplot() +
  band_layer +
  theme_classic() +
  labs(y = "CFU/ml")

logCFU + nonLOG + plot_layout(nrow = 2)


##----------------------------------
## modelling
##----------------------------------
##---------- model2

# growthcurver fit through all individual counts of each species/group
# combination, evaluated on 50 evenly spaced time points between 0 and 24.
df.predicted_model <- data.frame()
for (samplezzz in unique(rmk202_strain_timeseries_Sheet1_long_cleaned_model$groupingFinal)) {
  counts_of_group <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
    filter(groupingFinal == samplezzz)

  model.wt <- SummarizeGrowth(counts_of_group$time, counts_of_group$CFU)

  tt <- seq(0, 24, length = 50)
  predicted_curve <- data.frame(
    group = samplezzz,
    time = tt,
    pred.wt = predict(model.wt$model, newdata = list(t = tt))
  )

  df.predicted_model <- rbind(df.predicted_model, predicted_curve)
}

# Fixed order of the curves; the species is the part before the underscore.
df.predicted_model$group <- factor(
  df.predicted_model$group,
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise", "S. thermophilus_strepto",
    "S. thermophilus_lacto", "L. delbrueckii_rmk", "L. delbrueckii_pairwise",
    "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)
df.predicted_model$species <- str_split_fixed(df.predicted_model$group, "_", 2)[, 1]

# Modelled growth curves, line type by species.
plot_overtime_model <- ggplot() +
  geom_line(
    data = df.predicted_model,
    aes(x = time, y = pred.wt, group = group, color = group, linetype = species),
    size = 1.25
  ) +
  theme_classic() +
  lims(x = c(0, 24), y = c(0, maxValue)) +
  scale_color_manual(values = fill_colers) +
  scale_linetype_manual(values = c("dashed", "dotted")) +
  scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c(0, 6, 12, 18, 24))
plot_overtime_model

# Interquartile ribbon above the modelled curves: shown, then written to file.
plot_overtime_RIBBON + plot_overtime_model + plot_layout(nrow = 2)
svg("03_results/growth_curve.svg", width = 8, height = 8)
plot_overtime_RIBBON + plot_overtime_model + plot_layout(nrow = 2)
dev.off()

##--------------------------------
## phase
##--------------------------------

# Group part of the "<species>_<group>" label (text after the underscore).
df.predicted_model$groupsss <- str_split_fixed(df.predicted_model$group, "_", 2)[, 2]

df.predicted_model$group
lagsss_mean

# Translate the grouping names of the acidification data into the group names
# of the CFU data, so the lag offsets can be joined onto the growth curves.
lagsss_mean$grouping_new <- plyr::revalue(
  lagsss_mean$grouping,
  c("lactobacillus" = "lacto", "starterCulture" = "rmk", "streptococcus" = "strepto")
)
lagsss_mean_prepped <- lagsss_mean %>%
  dplyr::select(c("grouping_new", "hours", "hours_corrected"))

df.predicted_model_extended <- merge(
  df.predicted_model,
  lagsss_mean_prepped,
  by.x = "groupsss",
  by.y = "grouping_new"
)

# Shift the time axis by the lag offset: "rmk" uses the uncorrected offset,
# every other group the corrected one.
df.predicted_model_extended$timeCurated <- ifelse(
  df.predicted_model_extended$groupsss == "rmk",
  df.predicted_model_extended$time - df.predicted_model_extended$hours,
  df.predicted_model_extended$time - df.predicted_model_extended$hours_corrected
)

# Keep the window from the start of the phased acidification data up to 18.
df.predicted_model_extended <- df.predicted_model_extended %>%
  filter(timeCurated <= 18, timeCurated >= minTime)

table(df.predicted_model_extended$group)

# Culture type used as line type in the figure.
df.predicted_model_extended$typess <- plyr::revalue(
  df.predicted_model_extended$group,
  c(
    "S. thermophilus_rmk" = "RMK202", "L. delbrueckii_rmk" = "RMK202",
    "S. thermophilus_pairwise" = "pairwise", "L. delbrueckii_pairwise" = "pairwise",
    "S. thermophilus_strepto" = "isolate", "L. delbrueckii_lacto" = "isolate"
  )
)
table(df.predicted_model_extended$typess)

colorsss_01 <- c("#fec3b7ff", "#0081a7ff")

# Phased growth curves: colour by species, line type by culture type.
plot_overtime_model <- ggplot() +
  geom_line(
    data = df.predicted_model_extended,
    aes(x = timeCurated, y = pred.wt, group = group, color = species, linetype = typess),
    size = 1.25
  ) +
  theme_classic() +
  lims(x = c(minTime, 18), y = c(0, maxValue)) +
  labs(y = "CFU/ml") +
  scale_color_manual(values = colorsss_01) +
  scale_linetype_manual(values = c("dashed", "dotted", "solid")) +
  scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c(0, 6, 12, 18))
plot_overtime_model
##-------------------Ribbon
# Inter-quantile ribbon of the measured CFU per species/sample on the same
# lag-corrected time axis, followed by the export of the stacked figure.

# Combined species_sample label as a factor with a fixed level order.
group_growth_cfu_quantiles_spec$groupingFinal <- factor(
  paste0(
    group_growth_cfu_quantiles_spec$species,
    "_",
    group_growth_cfu_quantiles_spec$sample
  ),
  levels = c(
    "S. thermophilus_rmk", "S. thermophilus_pairwise",
    "S. thermophilus_strepto", "S. thermophilus_lacto",
    "L. delbrueckii_rmk", "L. delbrueckii_pairwise",
    "L. delbrueckii_lacto", "L. delbrueckii_strepto"
  )
)

# Six-colour palette (only referenced by the older, disabled plot variants).
fill_colers <- c(
  "#FC766AFF", "#783937FF", "#F1AC88FF",
  "#339E66FF", "#078282FF", "#95DBE5FF"
)

# y-axis range taken from the upper ribbon bound (`seventhts`).
maxValue <- max(group_growth_cfu_quantiles_spec$seventhts)
minnsValue <- min(group_growth_cfu_quantiles_spec$seventhts)

# Attach the lag columns per sample and compute the lag-corrected time:
# "rmk" samples subtract `hours`, all others subtract `hours_corrected`.
# Only the lower bound (minTime) is filtered here.
group_growth_cfu_quantiles_spec_curated <- merge(
  group_growth_cfu_quantiles_spec,
  lagsss_mean_prepped,
  by.x = "sample",
  by.y = "grouping_new"
) %>%
  dplyr::mutate(
    timeCurated = ifelse(sample == "rmk", time - hours, time - hours_corrected)
  ) %>%
  filter(timeCurated >= minTime)

colorsss <- c("#0081a7ff", "#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#4e3d42ff")

# NOTE(review): this two-colour palette is the reverse of the `colorsss_01`
# that was in effect when plot_overtime_model was built above; confirm that
# both panels map the species to the same colours.
colorsss_01 <- c("#0081a7ff", "#fec3b7ff")

# Ribbon between `twentyfifths` and `seventhts`, one ribbon per sample/species.
plot_overtime_RIBBON <- ggplot() +
  geom_ribbon(
    aes(
      x = timeCurated,
      ymin = twentyfifths,
      ymax = seventhts,
      group = interaction(sample, species),
      fill = species
    ),
    data = group_growth_cfu_quantiles_spec_curated,
    alpha = .2
  ) +
  theme_classic() +
  lims(x = c(minTime, 18), y = c(minnsValue, maxValue)) +
  scale_fill_manual(values = colorsss_01) +
  scale_color_manual(values = colorsss_01) +
  scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c(0, 6, 12, 18)) +
  theme(legend.position = "none")
plot_overtime_RIBBON

# On-screen preview: ribbon, model (with legend) and low_phased_ribbon stacked.
plot_overtime_RIBBON +
  plot_overtime_model +
  (low_phased_ribbon + theme(legend.position = "none")) +
  plot_layout(nrow = 3)

# Exported version: same three panels, all legends removed.
svg("03_results/growth_curve_02.svg", width = 6, height = 9)
plot_overtime_RIBBON +
  (plot_overtime_model + theme(legend.position = "none")) +
  (low_phased_ribbon + theme(legend.position = "none")) +
  plot_layout(nrow = 3)
dev.off()

##--------------------------------
##stats
##compare final growth value
##--------------------------------
# Overview of the available groups, methods, groupings and time points.
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$group)
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$Method)
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$groupingFinal)
table(rmk202_strain_timeseries_Sheet1_long_cleaned_model$time)

# L. delbrueckii grown alone vs. L. delbrueckii in co-culture (pairwise or
# RMK202), both restricted to the rows with time == "18".
ldel_vergleich_final_01 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(
    groupingFinal == "L. delbrueckii_lacto",
    time == "18"
  )
ldel_vergleich_final_02 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(
    groupingFinal %in% c("L. delbrueckii_pairwise", "L. delbrueckii_rmk"),
    time == "18"
  )

t.test(ldel_vergleich_final_01$CFU, ldel_vergleich_final_02$CFU)

###between Sterm and Ldel
# S. thermophilus in co-culture vs. L. delbrueckii in co-culture at time "18".
Sterm_vergleich_final_01 <- rmk202_strain_timeseries_Sheet1_long_cleaned_model %>%
  filter(
    groupingFinal %in% c("S. thermophilus_pairwise", "S. thermophilus_rmk"),
    time == "18"
  )

t.test(Sterm_vergleich_final_01$CFU, ldel_vergleich_final_02$CFU)

1.4.3 Metabolomics

Here, I analyse the metabolomics data that I received from the GC/MS measurements of Pascal and Yihelene on 13.8.2020.

# Untargeted GC/MS metabolomics: load the intensity table, keep the compounds
# with an acceptable retention time, drop all-zero compounds, run a PCA and
# export the biplot.
X20200810_Vincent_cultures_profinder_untargeted <- read.xlsx("../data_zenodo/non_genomic_data/20200810_Vincent_cultures_profinder_untargeted.xlsx")
# NOTE(review): `df` is not created anywhere in this section and `pca_res` is
# not used below; if `df` is not defined earlier in the file this call fails.
# Kept unchanged - confirm whether it can be removed.
pca_res <- prcomp(df, scale. = TRUE)

Metabolites_information_reduced <- read_delim("../data_zenodo/non_genomic_data/Metabolites_information_reduced.csv", "\t", escape_double = FALSE, trim_ws = TRUE)

# Keep only the compounds with an average retention time below 46.6.
nrow(Metabolites_information_reduced)
par(mfrow = c(1, 1))
hist(Metabolites_information_reduced$`RT (avg)`)
Metabolites_information_reduced_goodOnes <- Metabolites_information_reduced %>%
  filter(`RT (avg)` < 46.6)
nrow(Metabolites_information_reduced_goodOnes)

# The first four columns are kept as-is; the remaining columns are restricted
# to the retained compounds.
X20200810_Vincent_cultures_profinder_untargeted <- X20200810_Vincent_cultures_profinder_untargeted %>%
  dplyr::select(c(
    colnames(X20200810_Vincent_cultures_profinder_untargeted)[1:4],
    Metabolites_information_reduced_goodOnes$compounds
  ))

# Quick look at the table. colnames() returns a plain character vector, so it
# is indexed with one dimension (the former `[1:3, 1:10]` raised an error).
colnames(X20200810_Vincent_cultures_profinder_untargeted)[1:10]
X20200810_Vincent_cultures_profinder_untargeted[1:3, 1:10]

##------------------remove zero columns
# Compounds (columns 5 onwards) whose intensities sum to zero are dropped.
intensity_columns <- 5:ncol(X20200810_Vincent_cultures_profinder_untargeted)
zero_compounds <- names(which(
  colSums(X20200810_Vincent_cultures_profinder_untargeted[, intensity_columns]) == 0
))
X20200810_Vincent_cultures_profinder_untargeted_new <- X20200810_Vincent_cultures_profinder_untargeted[
  , !(colnames(X20200810_Vincent_cultures_profinder_untargeted) %in% zero_compounds)
]

dim(X20200810_Vincent_cultures_profinder_untargeted)
dim(X20200810_Vincent_cultures_profinder_untargeted_new)

# ##-----------------------
# ##----------------------NON-ADJUSTED SCALES
# ##-----------------------
# The PCA has to exist before its scores are extracted below, so it is
# computed here (uncentred, unscaled) rather than after `ploting_data`.
metabolo.pca <- prcomp(
  X20200810_Vincent_cultures_profinder_untargeted_new[, 5:ncol(X20200810_Vincent_cultures_profinder_untargeted_new)],
  center = FALSE,
  scale. = FALSE
)

# PCA scores plus the sample annotation. Only columns were removed above, so
# the rows of both tables are in the same order.
ploting_data <- metabolo.pca$x %>% as.data.frame()
ploting_data$sample <- X20200810_Vincent_cultures_profinder_untargeted$sample
ploting_data$biological_duplicate <- X20200810_Vincent_cultures_profinder_untargeted$biological_duplicate
ploting_data$analytical_duplicate <- X20200810_Vincent_cultures_profinder_untargeted$analytical_duplicate
# The first letter of the sample name encodes the culture type.
ploting_data$grouping <- substring(X20200810_Vincent_cultures_profinder_untargeted$sample, 1, 1)
table(ploting_data$grouping)
ploting_data$grouping_long <- plyr::revalue(
  ploting_data$grouping,
  c(
    "A" = "All strains combined",
    "L" = "L. delbrueckii only",
    "S" = "S. thermophilus only",
    "R" = "original RMK202 starter culture",
    "M" = "pairwise strain mix"
  )
)
ploting_data

unique(ploting_data$grouping_long)

# One colour per culture type (an earlier variant without the leading "#" was
# immediately overwritten and has been dropped).
colorsss <- c("#4e3d42ff", "#fec3b7ff", "#9f9f92ff", "#6d6466ff", "#0081a7ff")

pcaFINAL <- ggbiplot(
  metabolo.pca,
  var.axes = FALSE,
  groups = ploting_data$grouping_long,
  ellipse = TRUE,
  obs.scale = 1,
  size = 2
) +
  theme_classic() +
  scale_color_manual(values = colorsss) +
  scale_fill_manual(values = colorsss)
# pcaFINAL

svg("03_results/20200810_Vincent_cultures_pca_FINAL.svg", width = 8, height = 5)
pcaFINAL
dev.off()

1.5 Figure 5

Figure 5. CRISPR spacer diversity of L. delbrueckii and S. thermophilus. A) The correlation of fraction of shared CRISPR spacers and ANI of all L. delbrueckii and S. thermophilus with the corresponding densities and heatmaps on the x and y-axis. B) The heatmap of the genomic and CRISPR spacer diversities of all S. thermophilus illustrated with ANI (top heatmap; from white to red) and percent shared CRISPR spacers (bottom heatmap; from white to blue). C) The amount of metagenomic and genomic CRISPR spacers according to the five arrays.

Figure 5. CRISPR spacer diversity of L. delbrueckii and S. thermophilus. A) The correlation of fraction of shared CRISPR spacers and ANI of all L. delbrueckii and S. thermophilus with the corresponding densities and heatmaps on the x and y-axis. B) The heatmap of the genomic and CRISPR spacer diversities of all S. thermophilus illustrated with ANI (top heatmap; from white to red) and percent shared CRISPR spacers (bottom heatmap; from white to blue). C) The amount of metagenomic and genomic CRISPR spacers according to the five arrays.

1.5.1 ANI

FastANI was used to calculate all pairwise ANI values

##==============================
##FastANI curation (Sterm + Ldel)
##==============================

# Read a raw, header-less FastANI table and tidy it up.
#
# raw_path: path to the tab-separated FastANI output (5 columns).
# fna_dir:  directory prefix that FastANI wrote in front of every genome name.
#
# Returns the table with
#   * coverage_ANI = 100 * mappedFragemnts / totFragemnts,
#   * GenomeA / GenomeB stripped of the directory prefix, the ".fna" /
#     ".fasta" extension and the "L_I_202_" / "S_O_202_" / "S_I_202_" prefixes,
#   * ANI rounded to two digits.
#
# All tokens are removed as literal strings (fixed()); with a plain regex the
# "." in ".fna" / ".fasta" would match any character.
curate_fastani_table <- function(raw_path, fna_dir) {
  ani <- read_delim(
    raw_path, "\t",
    escape_double = FALSE,
    col_names = c("GenomeA", "GenomeB", "ANI", "mappedFragemnts", "totFragemnts"),
    trim_ws = TRUE
  )
  ani$coverage_ANI <- 100 * (ani$mappedFragemnts / ani$totFragemnts)

  # Same removal order as before: directory, extensions, sample prefixes.
  tokens <- c(fna_dir, ".fna", ".fasta", "L_I_202_", "S_O_202_", "S_I_202_")
  for (genome_col in c("GenomeA", "GenomeB")) {
    for (token in tokens) {
      ani[[genome_col]] <- str_remove(ani[[genome_col]], fixed(token))
    }
  }

  ani$ANI <- round(ani$ANI, digits = 2)
  ani
}

##==============================
##Sterm

final_ANI <- curate_fastani_table(
  "../data_zenodo/non_genomic_data//fastaANI_comparision_sterm.txt",
  "/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/FNA_all/"
)

# final_ANI$GenomeB <- revalue(final_ANI$GenomeB, c("S50"="mst1","S72"="mst2"))
# final_ANI$GenomeA <- revalue(final_ANI$GenomeA, c("S50"="mst1","S72"="mst2"))

# NOTE(review): the curated table is written to ../03_results/ but the heatmap
# section reads it from ../data_zenodo/non_genomic_data/ - presumably the file
# is copied by hand; confirm.
write.table(final_ANI, "../03_results/fastaANI_comparision_curated_sterm.txt", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

##==============================
##Ldel
##==============================

library(readr)
library(plyr)
library(tidyverse)
final_ANI <- curate_fastani_table(
  "../data_zenodo/non_genomic_data//fastaANI_comparision_ldel.txt",
  "/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Ldel/FNA_all/"
)

# final_ANI$GenomeB <- revalue(final_ANI$GenomeB, c("S50"="mst1","S72"="mst2"))
# final_ANI$GenomeA <- revalue(final_ANI$GenomeA, c("S50"="mst1","S72"="mst2"))

final_ANI
write.table(final_ANI, "../03_results/fastaANI_comparision_curated_ldel.txt", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
##------------

1.5.2 shared CRISPRs

Here, we count how many and which spacers are in every cluster across all samples. If a new sample (whole genome sequence) is added, it needs to be added to the list of sample IDs below. The resulting table is used for the heatmap of the CRISPR spacer identity.


species=sTERM

# Locations of the CD-HIT spacer clustering of this species.
spacer_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/02_CRISPR/CD-hit_spacers/${species}
cluster_file=${spacer_dir}/${species}_results.txt.clstr
tmp_cluster=${spacer_dir}/count//tmpCRISPRS.txt
cluster_table=${spacer_dir}/count//clusterCRISPRs.txt

# Number of ">Cluster" records in the CD-HIT .clstr file.
count=$(grep ">Cluster" -c "${cluster_file}")

# Print how many distinct sample prefixes (text before the first "_") occur
# in the sequence headers of the clustered spacer file.
grep ">" "${spacer_dir}/${species}_results.txt" | cut -d _ -f 1 | sed 's/>//g' | sort | uniq -c | wc -l

##----------prime
start_num=0
i=1
mkdir -p "${spacer_dir}/count/"

# Header of the per-cluster count table (one column per sample).
echo -e "Clustername\tSMAG\tS50\tS72\t13491\t13492\t13493\t13494\t13495\t13496\t13497\t13498\t13499c1\t13499c2\t13500\t13499\t24737\t24738\t24739\t24740\t24853\t24854\t24855" > "${cluster_table}"

# Count the lines of the current cluster excerpt that match the given pattern.
hits_in_cluster() {
  grep "$1" -c "${tmp_cluster}"
}

##---------start analysis
for i in $(seq 1 $count)
do
  echo $i

  # Copy the lines from ">Cluster <start_num>" up to ">Cluster <i>" into the
  # temporary file; the counts below are taken from this excerpt.
  sed -n "/>Cluster $start_num$/,/>Cluster $i$/p" "${cluster_file}" > "${tmp_cluster}"

  REF_mst1=$(hits_in_cluster ">S50")
  REF_mst2=$(hits_in_cluster ">S72")
  REF_RMK202=$(hits_in_cluster ">SMAG")

  F13491=$(hits_in_cluster ">13491")
  F13492=$(hits_in_cluster ">13492")
  F13493=$(hits_in_cluster ">13493")
  F13494=$(hits_in_cluster ">13494")
  F13495=$(hits_in_cluster ">13495")
  F13496=$(hits_in_cluster ">13496")
  F13497=$(hits_in_cluster ">13497")
  F13498=$(hits_in_cluster ">13498")
  F13499c1=$(hits_in_cluster ">13499c1")
  F13499c2=$(hits_in_cluster ">13499c2")
  F13500=$(hits_in_cluster ">13500")

  # ">13499_" (with underscore) so that 13499c1 / 13499c2 are not counted here.
  F13499_old=$(hits_in_cluster ">13499_")
  F24737=$(hits_in_cluster ">24737")
  F24738=$(hits_in_cluster ">24738")
  F24740=$(hits_in_cluster ">24740")
  F24739=$(hits_in_cluster ">24739")
  F24853=$(hits_in_cluster ">24853")
  F24854=$(hits_in_cluster ">24854")
  F24855=$(hits_in_cluster ">24855")

  # One row per cluster, columns in the same order as the header above.
  echo -e "Cluster_${start_num}\t${REF_RMK202}\t${REF_mst1}\t${REF_mst2}\t${F13491}\t${F13492}\t${F13493}\t${F13494}\t${F13495}\t${F13496}\t${F13497}\t${F13498}\t${F13499c1}\t${F13499c2}\t${F13500}\t${F13499_old}\t${F24737}\t${F24738}\t${F24739}\t${F24740}\t${F24853}\t${F24854}\t${F24855}" >> "${cluster_table}"

  start_num=$((start_num+1))
done

LDEL——————————-


species=Ldel

# Locations of the CD-HIT spacer clustering of this species.
spacer_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/02_CRISPR/CD-hit_spacers/${species}
cluster_file=${spacer_dir}/${species}_results.txt.clstr
tmp_cluster=${spacer_dir}/count//tmpCRISPRS.txt
cluster_table=${spacer_dir}/count//clusterCRISPRs.txt

# Number of ">Cluster" records in the CD-HIT .clstr file.
count=$(grep ">Cluster" -c "${cluster_file}")

# Print how many distinct sample prefixes (text before the first "_") occur
# in the sequence headers of the clustered spacer file.
grep ">" "${spacer_dir}/${species}_results.txt" | cut -d _ -f 1 | sed 's/>//g' | sort | uniq -c | wc -l

##----------prime
start_num=0
i=1
mkdir -p "${spacer_dir}/count/"

#grep ">" /archiv/Projects/2019_pilotplant/02_annotation/CRISPR/mapping2phageGenome/CRISPR_alltogether_new_withOLDsamples_newNAMED_short |sed 's/>//g' | cut -d '_' -f 1 |sort| uniq

# Sample IDs that are counted per cluster. The header AND every row are built
# from this single list, so they can no longer drift apart. Previously the row
# echoed variables that only the S. thermophilus loop sets (REF_mst1, F13491,
# ...) while the L. delbrueckii counts computed here were never written.
# NOTE(review): the list is exactly the set of IDs the original loop grepped
# for (24778 was counted but missing from the old header; 13498 and 13499c1
# were in the old header but never counted here). IDs from 13499c2 onwards
# also appear in the S. thermophilus list - confirm they belong in this table.
sample_ids=(LMAG 11141 11142 11143 12104 12105 12107 12109 24776 24777 24778 13499c2 13500 13499 24737 24738 24739 24740 24853 24854 24855)

header="Clustername"
for id in "${sample_ids[@]}"
do
  header+="\t${id}"
done
echo -e "${header}" > "${cluster_table}"

##---------start analysis
for i in $(seq 1 "$count")
do
  echo "$i"

  # Copy the lines from ">Cluster <start_num>" up to ">Cluster <i>" into the
  # temporary file; the counts below are taken from this excerpt.
  sed -n "/>Cluster $start_num$/,/>Cluster $i$/p" "${cluster_file}" > "${tmp_cluster}"

  row="Cluster_${start_num}"
  for id in "${sample_ids[@]}"
  do
    case "${id}" in
      # ">13499_" (with underscore) so that 13499c1 / 13499c2 are not counted.
      13499) pattern=">13499_" ;;
      *)     pattern=">${id}" ;;
    esac
    # grep -c prints 0 (and returns non-zero) when nothing matches.
    row+="\t$(grep -c -- "${pattern}" "${tmp_cluster}")"
  done
  echo -e "${row}" >> "${cluster_table}"

  start_num=$((start_num+1))
done

Now I look at how many SPACERS ARE SHARED amongst strains

Now make HEATMAPS

Here, I combine the FastANI heatmap and the CRISPR spacer heatmap into one combined heatmap.

##--------------------------
 #Sterm
##--------------------------
# Load the curated pairwise tables (shared CRISPR spacers and FastANI) for
# S. thermophilus and derive a genome order from hierarchical clustering.

library(readr)
library(plyr)
library(tidyverse)
pairwiseSharedSpacers <- read_delim(
  "../data_zenodo/non_genomic_data//clusterCRISPRs_curated_sterm.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
final_ANI <- read_delim(
  "../data_zenodo/non_genomic_data//fastaANI_comparision_curated_sterm.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Sanity check: do both tables contain the same genome identifiers?
unique(final_ANI$GenomeA)
unique(pairwiseSharedSpacers$Ref1)

unique(final_ANI$GenomeA) %in% unique(pairwiseSharedSpacers$Ref1)
unique(pairwiseSharedSpacers$Ref1) %in% unique(final_ANI$GenomeA)


##===================================================
##dENDROGRAMM
#this is to order them according to the phylogeny
##===================================================
##------------
##make matrix
##------------

# Square genome x genome matrix, filled with the pairwise ANI values below.
names <- unique(final_ANI$GenomeA)

heatmap_prep_matrix <- matrix(
  NA,
  nrow = length(names),
  ncol = length(names),
  dimnames = list(names, names)
)

# Drop exact duplicate rows before the lookup.
final_ANI_02 <- final_ANI[!duplicated(final_ANI), ]

# One lookup per (row genome, column genome) cell.
for (row_genome in names) {
  for (col_genome in names) {
    pair_row <- which(
      final_ANI_02$GenomeA == row_genome & final_ANI_02$GenomeB == col_genome
    )
    heatmap_prep_matrix[row_genome, col_genome] <- as.numeric(final_ANI_02[pair_row, "ANI"])
  }
}

##------------
##make dendro of the ANI matrix
##------------
library(ggdendro)

# Distances between the ANI profiles of the genomes -> hclust -> dendrogram.
otter.dendro <- as.dendrogram(hclust(d = dist(x = heatmap_prep_matrix)))

# Leaf order of the dendrogram, used to order the heatmap axes.
orderSterm <- labels(otter.dendro)

# Create dendro
dendro.plot <- ggdendrogram(data = otter.dendro, rotate = FALSE)
# Preview the plot
print(dendro.plot)

##===================================================
##FASTani heatmap
#this is the upper heatmap
##===================================================
# Step 1: order both genome columns by the dendrogram leaves and show the order.
final_ANI$GenomeA <- factor(final_ANI$GenomeA, levels = orderSterm)
final_ANI$GenomeB <- factor(final_ANI$GenomeB, levels = orderSterm)
levels(final_ANI$GenomeA)
levels(final_ANI$GenomeB)

# Step 2: replace it by the hand-curated display order used in the figure.
sterm_display_order <- c(
  "13492", "13491", "24854", "24737", "13500", "13493", "13498", "SMAG",
  "24853", "13494", "24855", "S72", "13499c1", "13499", "13499c2", "24738",
  "24740", "S50", "13497", "24739", "13495", "13496"
)
final_ANI$GenomeA <- factor(final_ANI$GenomeA, levels = sterm_display_order)
final_ANI$GenomeB <- factor(final_ANI$GenomeB, levels = sterm_display_order)


##3-----------------
#keep only one triangle of the matrix
##3-----------------
# For every level i > 1 of GenomeA, remove the pairs whose GenomeB is one of
# the levels 1 .. i-1, i.e. every pair in which GenomeA comes later in the
# level order than GenomeB.
final_ANI_02 <- final_ANI
genome_levels <- levels(final_ANI_02$GenomeA)
for (row_idx in seq_along(genome_levels)[-1]) {
  for (col_idx in seq_len(row_idx - 1)) {
    final_ANI_02 <- final_ANI_02 %>%
      filter(!(GenomeA == genome_levels[row_idx] & GenomeB == genome_levels[col_idx]))
  }
}

# Tile heatmap of the remaining triangle; colour scale limited to ANI 99-100.
ggheatmap_ANI <- ggplot(final_ANI_02, aes(GenomeA, GenomeB, fill = ANI)) +
  geom_tile(color = "white", size = 1.1) +
  scale_fill_gradient2(
    low = "grey",
    high = "red",
    midpoint = 99,
    limit = c(99, 100),
    space = "Lab",
    name = "ANI"
  ) +
  scale_y_discrete(position = "right") +
  labs(legend = "ANI", x = "", y = "") +
  theme_minimal() +
  theme(
    legend.position = "top",
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  ) +
  coord_fixed()
# Print the heatmap
print(ggheatmap_ANI)

# Density of the pairwise ANI values without the self-comparisons.
final_ANI_03 <- final_ANI_02 %>% filter(GenomeA != GenomeB)

densityANI <- ggplot(final_ANI_03, aes(x = ANI)) +
  geom_density(color = "#EB4D4D", size = 1.5) +
  theme_classic() +
  lims(x = c(99, 100))
densityANI

##===================================================
##CRISPR ID
#this is the LOWER heatmap
##===================================================

library(viridis)

# Make the pair table symmetric: every (Ref1, Ref2) pair is added a second
# time with the two genomes swapped and the same PercentShared value.
pairwiseSharedSpacers_tmp_02 <- with(
  pairwiseSharedSpacers,
  data.frame(
    Ref1 = c(as.character(Ref2), as.character(Ref1)),
    Ref2 = c(as.character(Ref1), as.character(Ref2)),
    PercentShared = c(as.numeric(PercentShared), as.numeric(PercentShared))
  )
)

# Step 1: dendrogram order (the levels of the ANI table are shown for comparison).
pairwiseSharedSpacers_tmp_02$Ref1 <- factor(pairwiseSharedSpacers_tmp_02$Ref1, levels = orderSterm)
pairwiseSharedSpacers_tmp_02$Ref2 <- factor(pairwiseSharedSpacers_tmp_02$Ref2, levels = orderSterm)
levels(final_ANI$GenomeA)
levels(final_ANI$GenomeB)

# Step 2: the same hand-curated display order as in the ANI heatmap.
sterm_display_order <- c(
  "13492", "13491", "24854", "24737", "13500", "13493", "13498", "SMAG",
  "24853", "13494", "24855", "S72", "13499c1", "13499", "13499c2", "24738",
  "24740", "S50", "13497", "24739", "13495", "13496"
)
pairwiseSharedSpacers_tmp_02$Ref2 <- factor(pairwiseSharedSpacers_tmp_02$Ref2, levels = sterm_display_order)
pairwiseSharedSpacers_tmp_02$Ref1 <- factor(pairwiseSharedSpacers_tmp_02$Ref1, levels = sterm_display_order)


##3-----------------
#keep only one triangle of the matrix
##3-----------------
# For every level i > 1 of Ref1, remove the pairs whose Ref2 is one of the
# levels 1 .. i-1.
pairwiseSharedSpacers_tmp_03 <- pairwiseSharedSpacers_tmp_02
ref_levels <- levels(pairwiseSharedSpacers_tmp_02$Ref1)
for (row_idx in seq_along(ref_levels)[-1]) {
  for (col_idx in seq_len(row_idx - 1)) {
    pairwiseSharedSpacers_tmp_03 <- pairwiseSharedSpacers_tmp_03 %>%
      filter(!(Ref1 == ref_levels[row_idx] & Ref2 == ref_levels[col_idx]))
  }
}

# Tile heatmap of the percentage of shared spacers (axes swapped relative to
# the ANI heatmap: Ref2 on x, Ref1 on y).
ggheatmap_CRISPR <- ggplot(
  pairwiseSharedSpacers_tmp_03,
  aes(Ref2, Ref1, fill = PercentShared)
) +
  geom_tile(color = "white", size = 1.1) +
  scale_fill_gradient2(
    low = "grey",
    high = "blue",
    midpoint = 25,
    limit = c(0, 100),
    space = "Lab",
    name = "shared CRISPR"
  ) +
  labs(legend = "shared CRISPR", x = "", y = "") +
  theme_minimal() +
  theme(
    legend.position = "top",
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  ) +
  coord_fixed()
# Print the heatmap
print(ggheatmap_CRISPR)

# Density of the shared-spacer percentages without the self-comparisons.
pairwiseSharedSpacers_tmp_04 <- pairwiseSharedSpacers_tmp_03 %>% filter(Ref1 != Ref2)

densityCRISPR <- ggplot(pairwiseSharedSpacers_tmp_04, aes(x = PercentShared)) +
  geom_density(color = "#EB4D4D", size = 1.5) +
  theme_classic() +
  lims(x = c(0, 100))
densityCRISPR



library(patchwork)

##===================================================
##plot
##===================================================

# Export 1: densities on top, the two heatmaps underneath.
svg("../03_results/HEATMAP_ani_crispr_density.svg", width = 20, height = 12)
(densityANI + densityCRISPR) /
  (ggheatmap_ANI + ggheatmap_CRISPR)
dev.off()

# Export 2: both heatmaps next to the dendrogram.
svg("../03_results/HEATMAP_ani_crispr.svg", width = 15, height = 7.5)
ggheatmap_ANI +
  ggheatmap_CRISPR +
  dendro.plot
dev.off()

# Export 3: both heatmaps without their legends.
svg("../03_results/HEATMAP_ani_crispr_woLegend.svg", width = 9, height = 7.5)
(ggheatmap_ANI + theme(legend.position = "none")) +
  (ggheatmap_CRISPR + theme(legend.position = "none"))
dev.off()

##-----------------------
#bring together
##-----------------------
# Join the shared-spacer triangle with the ANI triangle on the genome pair and
# tag the rows with the species.
all_together_sterm <- add_column(
  merge(
    pairwiseSharedSpacers_tmp_03,
    final_ANI_02,
    by.x = c("Ref1", "Ref2"),
    by.y = c("GenomeA", "GenomeB")
  ),
  species = "S.thermophilus"
)

# Quick look: shared spacers against the FastANI fragment coverage.
ggplot(all_together_sterm, aes(x = PercentShared, y = coverage_ANI)) +
  geom_point() +
  theme_classic()
##--------------------------
 #Ldel
##--------------------------
# Same workflow as for S. thermophilus, now with the L. delbrueckii tables.

pairwiseSharedSpacers <- read_delim(
  "../data_zenodo/non_genomic_data//clusterCRISPRs_curated_ldel.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
final_ANI <- read_delim(
  "../data_zenodo/non_genomic_data//fastaANI_comparision_curated_ldel.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Sanity check: do both tables contain the same genome identifiers?
unique(final_ANI$GenomeA)
unique(pairwiseSharedSpacers$Ref1)

unique(final_ANI$GenomeA) %in% unique(pairwiseSharedSpacers$Ref1)
unique(pairwiseSharedSpacers$Ref1) %in% unique(final_ANI$GenomeA)


##===================================================
##dENDROGRAMM
#this is to order them according to the phylogeny
##===================================================
##------------
##make matrix
##------------

# Square genome x genome matrix, filled with the pairwise ANI values below.
names <- unique(final_ANI$GenomeA)

heatmap_prep_matrix <- matrix(
  NA,
  nrow = length(names),
  ncol = length(names),
  dimnames = list(names, names)
)

# Drop exact duplicate rows before the lookup.
final_ANI_02 <- final_ANI[!duplicated(final_ANI), ]

# One lookup per (row genome, column genome) cell.
for (row_genome in names) {
  for (col_genome in names) {
    pair_row <- which(
      final_ANI_02$GenomeA == row_genome & final_ANI_02$GenomeB == col_genome
    )
    heatmap_prep_matrix[row_genome, col_genome] <- as.numeric(final_ANI_02[pair_row, "ANI"])
  }
}

##------------
##make dendro of the ANI matrix
##------------
library(ggdendro)

# Distances between the ANI profiles of the genomes -> hclust -> dendrogram.
otter.dendro <- as.dendrogram(hclust(d = dist(x = heatmap_prep_matrix)))

# Leaf order of the dendrogram. The variable keeps the name `orderSterm`
# although it now holds the L. delbrueckii order.
orderSterm <- labels(otter.dendro)

# Create dendro
dendro.plot <- ggdendrogram(data = otter.dendro, rotate = FALSE)
# Preview the plot
print(dendro.plot)

##===================================================
##FASTani heatmap
#this is the upper heatmap
##===================================================
# Order both genome columns by the dendrogram leaves (`orderSterm` holds the
# L. delbrueckii order at this point) and show the resulting level order.
final_ANI$GenomeA <- factor(final_ANI$GenomeA, levels = orderSterm)
final_ANI$GenomeB <- factor(final_ANI$GenomeB, levels = orderSterm)
levels(final_ANI$GenomeA)
levels(final_ANI$GenomeB)

##3-----------------
#keep only one triangle of the matrix
##3-----------------
# For every level i > 1 of GenomeA, remove the pairs whose GenomeB is one of
# the levels 1 .. i-1.
final_ANI_02 <- final_ANI
genome_levels <- levels(final_ANI_02$GenomeA)
for (row_idx in seq_along(genome_levels)[-1]) {
  for (col_idx in seq_len(row_idx - 1)) {
    final_ANI_02 <- final_ANI_02 %>%
      filter(!(GenomeA == genome_levels[row_idx] & GenomeB == genome_levels[col_idx]))
  }
}

# Strain 11142 is excluded from the figure (see the remark in the CRISPR part).
final_ANI_02 <- final_ANI_02 %>%
  filter(GenomeA != "11142") %>%
  filter(GenomeB != "11142")

# Tile heatmap of the remaining triangle; colour scale limited to ANI 99-100.
ggheatmap_ANI <- ggplot(final_ANI_02, aes(GenomeA, GenomeB, fill = ANI)) +
  geom_tile(color = "white", size = 1.1) +
  scale_fill_gradient2(
    low = "grey",
    high = "red",
    midpoint = 99,
    limit = c(99, 100),
    space = "Lab",
    name = "ANI"
  ) +
  scale_y_discrete(position = "right") +
  labs(legend = "ANI", x = "", y = "") +
  theme_minimal() +
  theme(
    legend.position = "top",
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
  ) +
  coord_fixed()
# Print the heatmap
print(ggheatmap_ANI)

# Density of the pairwise ANI values without the self-comparisons.
final_ANI_03 <- final_ANI_02 %>% filter(GenomeA != GenomeB)

densityANI <- ggplot(final_ANI_03, aes(x = ANI)) +
  geom_density(color = "#10B552", size = 1.5) +
  theme_classic() +
  lims(x = c(99, 100))
densityANI


##===================================================
##CRISPR ID
#this is the LOWER heatmap
##=================================================== 


library(viridis)

# Make the pairwise table symmetric: every (Ref1, Ref2) row is stacked under
# its mirrored (Ref2, Ref1) copy, both carrying the same PercentShared value.
spacer_ref1 <- as.character(pairwiseSharedSpacers$Ref1)
spacer_ref2 <- as.character(pairwiseSharedSpacers$Ref2)
spacer_percent <- as.numeric(pairwiseSharedSpacers$PercentShared)
pairwiseSharedSpacers_tmp_02 <- data.frame(
  Ref1 = c(spacer_ref2, spacer_ref1),
  Ref2 = c(spacer_ref1, spacer_ref2),
  PercentShared = c(spacer_percent, spacer_percent)
)

# Same axis order as the ANI heatmap (dendrogram leaf order).
pairwiseSharedSpacers_tmp_02$Ref1 <- factor(pairwiseSharedSpacers_tmp_02$Ref1, levels = orderSterm)
pairwiseSharedSpacers_tmp_02$Ref2 <- factor(pairwiseSharedSpacers_tmp_02$Ref2, levels = orderSterm)
levels(final_ANI$GenomeA)
levels(final_ANI$GenomeB)

##3-----------------
# Keep only one triangle: for the strain at level position i (i >= 2) every
# pair whose Ref2 sits at a level position j < i is removed.
##3-----------------
pairwiseSharedSpacers_tmp_03 <- pairwiseSharedSpacers_tmp_02
strain_levels <- levels(pairwiseSharedSpacers_tmp_02$Ref1)
for (row_position in seq_along(strain_levels)[-1]) {
  rowsss <- strain_levels[row_position]
  for (Columnss in strain_levels[seq_len(row_position - 1)]) {
    pairwiseSharedSpacers_tmp_03 <- filter(
      pairwiseSharedSpacers_tmp_03,
      !(Ref1 == rowsss & Ref2 == Columnss)
    )
  }
}

##3-----------------
# plot
##3-----------------
# Author's note: strain 11142 is strange because it was divergent from the
# others and no additional SNV was found for it in the metagenome; a mistake
# in the strain collection is assumed, so the strain is removed.
pairwiseSharedSpacers_tmp_03 <- pairwiseSharedSpacers_tmp_03 %>%
  filter(Ref1 != "11142") %>%
  filter(Ref2 != "11142")
 
# Lower heatmap: one tile per retained strain pair, filled by the percentage
# of shared spacers (Ref2 on x, Ref1 on y).
crispr_heatmap_theme <- theme(
  legend.position = "top",
  axis.text.y = element_text(size = 12),
  axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
)

ggheatmap_CRISPR <- ggplot(pairwiseSharedSpacers_tmp_03, aes(Ref2, Ref1, fill = PercentShared)) +
  geom_tile(color = "white", size = 1.1) +
  scale_fill_gradient2(
    low = "grey", high = "blue",
    limit = c(0, 100), space = "Lab",
    name = "shared CRISPR"
  ) +
  labs(legend = "shared CRISPR", x = "", y = "") +
  theme_minimal() +
  crispr_heatmap_theme +
  coord_fixed()

# Show the heatmap.
print(ggheatmap_CRISPR)


# Density of the shared-spacer percentage, self-comparisons excluded.
pairwiseSharedSpacers_tmp_04 <- filter(pairwiseSharedSpacers_tmp_03, Ref1 != Ref2)

densityCRISPR <- ggplot(pairwiseSharedSpacers_tmp_04, aes(x = PercentShared)) +
  geom_density(color = "#10B552", size = 1.5) +
  theme_classic() +
  lims(x = c(0, 100))
densityCRISPR



##===================================================
## plot: write the combined panels to SVG files in ../03_results/
##===================================================
# Each composition is print()ed explicitly so the file is also filled when the
# script is run via source(), where top-level values are not auto-printed.

# Densities on top, heatmaps below.
svg("../03_results/HEATMAP_ani_crispr_density_ldel.svg", width = 20, height = 12)
print((densityANI + densityCRISPR) / (ggheatmap_ANI + ggheatmap_CRISPR))
dev.off()

# Both heatmaps next to the dendrogram.
svg("../03_results/HEATMAP_ani_crispr_ldel.svg", width = 15, height = 7.5)
print(ggheatmap_ANI + ggheatmap_CRISPR + dendro.plot)
dev.off()

# Both heatmaps without legends.
# The path separator after "03_results" was missing, which wrote the file
# next to (not into) the results directory.
svg("../03_results/HEATMAP_ani_crispr_woLegend_ldel.svg", width = 9, height = 7.5)
print(
  (ggheatmap_ANI + theme(legend.position = "none")) +
    (ggheatmap_CRISPR + theme(legend.position = "none"))
)
dev.off()
 ##-----------------------
 #bring together
 ##-----------------------
# Join shared-spacer percentages and ANI per strain pair and tag the species.
all_together_ldel <- merge(
  pairwiseSharedSpacers_tmp_03, final_ANI_02,
  by.x = c("Ref1", "Ref2"), by.y = c("GenomeA", "GenomeB")
) %>%
  add_column(species = "L.delbrueckii")

ggplot(all_together_ldel, aes(x = PercentShared, y = coverage_ANI)) +
  geom_point() +
  theme_classic()

colorsss <- c("#fec3b7ff", "#0081a7")
# colorsss_01 <- c("#0081a7ff","#fec3b7ff")

# Both species in one table (all_together_sterm is built earlier in the file).
all_together_all <- rbind(all_together_sterm, all_together_ldel)

# Layers shared by every version of the correlation plot.
correlation_layers <- list(
  geom_point(alpha = 0.6),
  theme_classic(),
  scale_color_manual(values = colorsss),
  scale_fill_manual(values = colorsss),
  theme(legend.position = "none")
)

# Shared spacers (x) against ANI (y).
# The y label is now passed by name; the former unnamed "ANI" argument of
# labs() was not attached to any axis.
plots_correlation <- ggplot(all_together_all, aes(x = PercentShared, y = ANI, color = species, fill = species)) +
  correlation_layers +
  labs(x = "shared CRISPR", y = "ANI") +
  lims(y = c(99, 100))
plots_correlation

# Same data with swapped axes: ANI (x) against shared spacers (y).
# The axis titles are swapped accordingly; previously the ANI axis was
# titled "shared CRISPR".
plots_correlation <- ggplot(all_together_all, aes(x = ANI, y = PercentShared, color = species, fill = species)) +
  correlation_layers +
  labs(x = "ANI", y = "shared CRISPR") +
  lims(x = c(99, 100))
plots_correlation

# NOTE(review): this path has no "../" prefix, unlike every other output in
# this section - confirm that a separate 03_results/ directory is intended.
svg("03_results/correaltion_all.svg", width = 3, height = 3)
print(plots_correlation)
dev.off()

# Author's note: strain 11142 is strange because it was divergent from the
# others and no additional SNV was found for it in the metagenome; a mistake
# in the strain collection is assumed, so the strain is removed.
all_together_all <- all_together_all %>%
  filter(Ref1 != "11142") %>%
  filter(Ref2 != "11142")

plots_correlation <- ggplot(all_together_all, aes(x = PercentShared, y = ANI, color = species, fill = species)) +
  correlation_layers +
  labs(x = "shared CRISPR", y = "ANI") +
  lims(y = c(99, 100))
plots_correlation

svg("../03_results/correaltion_all.svg", width = 3, height = 3)
print(plots_correlation)
dev.off()

1.5.3 metagenomic/genomic spacers

library(readr)
library(ggplot2)
# uniqueSpacers_count_Ref_wOLDstrains <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/ForDADA2/uniqueSpacers_count_Ref_wOLDstrains.txt","\t", escape_double = FALSE, trim_ws = TRUE)
uniqueSpacers_count_Ref_wOLDstrains <- read_delim(
  "../data_zenodo/non_genomic_data//uniqueSpacers_count_Ref_wOLDstrains.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Label every spacer by where it was found:
#   dadaSpacer > 0 and Strains > 0   -> "both Meta & strains"
#   dadaSpacer > 0 and Strains == 0  -> "explained only by meta"
#   everything else                  -> "explained only by strains"
ldel_seen_in_dada <- uniqueSpacers_count_Ref_wOLDstrains$dadaSpacer > 0
ldel_seen_in_strains <- uniqueSpacers_count_Ref_wOLDstrains$Strains > 0
ldel_absent_in_strains <- uniqueSpacers_count_Ref_wOLDstrains$Strains == 0
uniqueSpacers_count_Ref_wOLDstrains$explained <- ifelse(
  ldel_seen_in_dada & ldel_seen_in_strains,
  "both Meta & strains",
  ifelse(
    ldel_seen_in_dada & ldel_absent_in_strains,
    "explained only by meta",
    "explained only by strains"
  )
)

##-------------plot explained
colorsss <- rev(c("darkcyan", "darkturquoise", "lightblue"))

# Fixed stacking order of the three categories.
explained_levels <- c("explained only by meta", "both Meta & strains", "explained only by strains")
uniqueSpacers_count_Ref_wOLDstrains$explained <- factor(
  uniqueSpacers_count_Ref_wOLDstrains$explained,
  levels = explained_levels
)

# Array labels a1/a2 of this table are shown as CR4/CR5.
uniqueSpacers_count_Ref_wOLDstrains$ARRAYINFO <- revalue(
  uniqueSpacers_count_Ref_wOLDstrains$ARRAYINFO,
  c("a2" = "CR5", "a1" = "CR4")
)
table(uniqueSpacers_count_Ref_wOLDstrains$explained, uniqueSpacers_count_Ref_wOLDstrains$ARRAYINFO)

##new and wider
# Stacked spacer counts per array; the y axis ticks are hand-picked values.
ldel_spacer_ticks <- c(0, 10, 27, 39, 92, 117, 139, 163)
ldel_spacer_bars <- ggplot(uniqueSpacers_count_Ref_wOLDstrains, aes(x = ARRAYINFO, group = explained, fill = explained)) +
  geom_bar() +
  theme_classic() +
  scale_fill_manual(values = colorsss) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  ) +
  labs(y = "spacer counts", fill = "") +
  scale_y_continuous(breaks = ldel_spacer_ticks, labels = ldel_spacer_ticks)

svg("../03_results//spacerexplained_wide_Ldel.svg", width = 4, height = 3)
ldel_spacer_bars
dev.off()
 ##------------------------
 #what are the only strains CRISPRs
  ##------------------------
# Spacers that fall into the "explained only by strains" category ...
ONlyIsolateSPACERS <- filter(
  uniqueSpacers_count_Ref_wOLDstrains,
  explained == "explained only by strains"
)

# ... looked up in two spacer tables built elsewhere in the file
# (key: ClusterINFO here, ClusterName there). Both joins are only displayed.
merge(
  x = ONlyIsolateSPACERS, y = spacer_Infos_Sterm_final,
  by.x = "ClusterINFO", by.y = "ClusterName"
)

merge(
  x = ONlyIsolateSPACERS, y = uniqueSpacersss_extended,
  by.x = "ClusterINFO", by.y = "ClusterName"
)

 ##------------------------
 #include Sterm
  ##------------------------

uniqueSpacers_count_Ref_wOLDstrains_sterm <- read_delim(
  "../data_zenodo/non_genomic_data//uniqueSpacers_count_Ref_wOLDstrains_both.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Same three-way labelling as for the first spacer table:
#   dadaSpacer > 0 and Strains > 0   -> "both Meta & strains"
#   dadaSpacer > 0 and Strains == 0  -> "explained only by meta"
#   everything else                  -> "explained only by strains"
sterm_seen_in_dada <- uniqueSpacers_count_Ref_wOLDstrains_sterm$dadaSpacer > 0
sterm_seen_in_strains <- uniqueSpacers_count_Ref_wOLDstrains_sterm$Strains > 0
sterm_absent_in_strains <- uniqueSpacers_count_Ref_wOLDstrains_sterm$Strains == 0
uniqueSpacers_count_Ref_wOLDstrains_sterm$explained <- ifelse(
  sterm_seen_in_dada & sterm_seen_in_strains,
  "both Meta & strains",
  ifelse(
    sterm_seen_in_dada & sterm_absent_in_strains,
    "explained only by meta",
    "explained only by strains"
  )
)

##-------------plot explained
colorsss <- rev(c("darkcyan", "darkturquoise", "lightblue"))

# Fixed stacking order of the three categories.
sterm_explained_levels <- c("explained only by meta", "both Meta & strains", "explained only by strains")
uniqueSpacers_count_Ref_wOLDstrains_sterm$explained <- factor(
  uniqueSpacers_count_Ref_wOLDstrains_sterm$explained,
  levels = sterm_explained_levels
)

# Array labels a1/a2/a3 of this table are shown as CR1/CR2/CR3.
uniqueSpacers_count_Ref_wOLDstrains_sterm$ARRAYINFO <- revalue(
  uniqueSpacers_count_Ref_wOLDstrains_sterm$ARRAYINFO,
  c("a2" = "CR2", "a1" = "CR1", "a3" = "CR3")
)
table(uniqueSpacers_count_Ref_wOLDstrains_sterm$explained, uniqueSpacers_count_Ref_wOLDstrains_sterm$ARRAYINFO)

 ##------------------------
 #merge
##------------------------

# Stack both spacer tables (CR1-CR3 table first, CR4/CR5 table second).
uniqueSpacers_count_Ref_wOLDstrains_merge <- rbind(
  uniqueSpacers_count_Ref_wOLDstrains_sterm,
  uniqueSpacers_count_Ref_wOLDstrains
)

table(uniqueSpacers_count_Ref_wOLDstrains_merge$explained, uniqueSpacers_count_Ref_wOLDstrains_merge$ARRAYINFO)

# Stacked spacer counts per array for all arrays; built once, then drawn on
# screen and again into the SVG file. The y axis ticks are hand-picked values.
merged_spacer_ticks <- c(0, 10, 17, 27, 39, 54, 92, 101, 117, 139, 163)
merged_spacer_bars <- ggplot(uniqueSpacers_count_Ref_wOLDstrains_merge, aes(x = ARRAYINFO, group = explained, fill = explained)) +
  geom_bar() +
  theme_classic() +
  scale_fill_manual(values = colorsss) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  ) +
  labs(y = "spacer counts", fill = "") +
  scale_y_continuous(breaks = merged_spacer_ticks, labels = merged_spacer_ticks)

merged_spacer_bars
svg("../03_results//spacerexplained_wide_merged.svg", width = 5, height = 3)
merged_spacer_bars
dev.off()

1.6 Figure 6

Figure 6. Characteristics of the phages identified in the cheese starter cultures. A) Gene annotation of the two Streptococcus starter culture phages, RMK202_1 and RMK202_2, and the two closest relatives (illustrated in lighter colors). Protein similarity between genes are indicated in grey (80-95% identity) and black (95-100%). B) Relative abundance of bacteria and phages over all metagenomic samples based on genome read coverage. C) Fraction of Streptococcus genomes with an integrated phage as based on the read coverage of phage-bacteria spanning regions relative to the coverage of the S. thermophilus genome. D) Fraction of Streptococcus phages which show signs of integration as based on the read coverage of phage-bacteria spanning regions relative to the coverage of the Streptococcus phage genomes. E) The number of spacers mapping against the different phage types. F) The Streptococcus phage network with the protospacer containing phages colored or labeled according to phage type. G) The spacer abundance versus the protospacer abundance from all phage spacers. The database specific linear regression and distributions are indicated in the figure and the axis figures accordingly.

Figure 6. Characteristics of the phages identified in the cheese starter cultures. A) Gene annotation of the two Streptococcus starter culture phages, RMK202_1 and RMK202_2, and the two closest relatives (illustrated in lighter colors). Protein similarity between genes are indicated in grey (80-95% identity) and black (95-100%). B) Relative abundance of bacteria and phages over all metagenomic samples based on genome read coverage. C) Fraction of Streptococcus genomes with an integrated phage as based on the read coverage of phage-bacteria spanning regions relative to the coverage of the S. thermophilus genome. D) Fraction of Streptococcus phages which show signs of integration as based on the read coverage of phage-bacteria spanning regions relative to the coverage of the Streptococcus phage genomes. E) The number of spacers mapping against the different phage types. F) The Streptococcus phage network with the protospacer containing phages colored or labeled according to phage type. G) The spacer abundance versus the protospacer abundance from all phage spacers. The database specific linear regression and distributions are indicated in the figure and the axis figures accordingly.

1.6.1 Phage annotation

Prepare the BLAST alignments.

download hit table (txt) from online blast results

# Load the `three_genes` example data set and recolour its first comparison
# with the "grey" colour scheme for the values 0.6, 0.4 and 0.5.
data(three_genes)
example_grey_shades <- apply_color_scheme(c(0.6, 0.4, 0.5), "grey")
comparisons[[1]]$col <- example_grey_shades
comparisons
##-------------------------
## Pairwise phage comparisons for the gene map
##-------------------------
library(readr)

# Read one cleaned BLAST hit table (tab-separated, no header, columns
# query / subject / score) and add the columns needed for a comparison:
#   start1, end1 : 2nd and 3rd "-"-separated field of `query`, as numbers
#   start2, end2 : 2nd and 3rd "-"-separated field of `subject`, as numbers
#   col          : "grey66" for score < 95, "grey88" for 95 <= score < 99,
#                  "black" otherwise
# NOTE(review): the figure legend describes two identity classes (80-95 %
# grey, 95-100 % black) while three colours are assigned here - confirm.
# Returns the table with the added columns.
read_phage_blast_hits <- function(path) {
  hits <- read_delim(
    path,
    "\t",
    escape_double = FALSE,
    col_names = c("query", "subject", "score"),
    trim_ws = TRUE
  )

  # Split each identifier once and reuse the pieces for start and end.
  query_fields <- str_split_fixed(hits$query, fixed("-"), 4)
  subject_fields <- str_split_fixed(hits$subject, fixed("-"), 4)

  hits$start1 <- as.numeric(query_fields[, 2])
  hits$end1 <- as.numeric(query_fields[, 3])
  hits$start2 <- as.numeric(subject_fields[, 2])
  hits$end2 <- as.numeric(subject_fields[, 3])

  hits$col <- ifelse(
    hits$score < 95,
    "grey66",
    ifelse(hits$score < 99, "grey88", "black")
  )
  hits
}

# Turn a hit table from read_phage_blast_hits() into a genoPlotR comparison:
# adds a constant direction column ("1"), keeps hits with score > 80 and only
# the coordinate, direction and colour columns.
phage_hits_to_comparison <- function(hits) {
  hits %>%
    add_column(., "direction" = "1") %>%
    filter(score > 80) %>%
    dplyr::select(start1, end1, start2, end2, direction, col) %>%
    as.comparison()
}

##-------------------------
##Streptococcus_phage_Sterm_1_vs_2
##-------------------------
Sterm_1_vs_2_cleaned <- read_phage_blast_hits("../data_zenodo/non_genomic_data//Sterm_1_vs_2_cleaned.txt")
Sterm_1_vs_2_cleaned_final <- phage_hits_to_comparison(Sterm_1_vs_2_cleaned)

##-------------------------
##Streptococcus_phage_Sterm_2_vs_sw30
##-------------------------
Sterm_2_vs_sw30_cleaned <- read_phage_blast_hits("../data_zenodo/non_genomic_data//Sterm_2_vs_sw30_cleaned.txt")
Sterm_2_vs_sw30_cleaned_final <- phage_hits_to_comparison(Sterm_2_vs_sw30_cleaned)

##-------------------------
##Streptococcus_phage_Sterm_9874_vs_1
##-------------------------
Sterm_9874_vs_1_cleaned <- read_phage_blast_hits("../data_zenodo/non_genomic_data//Sterm_9874_vs_1_cleaned.txt")
Sterm_9874_vs_1_cleaned_final <- phage_hits_to_comparison(Sterm_9874_vs_1_cleaned)

##-----------------
## bring together: 9874 vs 1, 1 vs 2, 2 vs SW30 (in this order)
##-----------------
comparisons_mine <- list(
  Sterm_9874_vs_1_cleaned_final,
  Sterm_1_vs_2_cleaned_final,
  Sterm_2_vs_sw30_cleaned_final
)

I annotated the phages by blasting all the proteins against the BLAST DB. Then I manually transferred the information. Now I will create a file for genoPlotR.

cd /home/vincent/Desktop/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned/annotation/

# Column header shared by all genoPlotR input tables.
genoplotr_header="name\tstart\tend\tstrand\tcol\tfill\tgene_type"

# write_genoplotr_table INPUT_GFF OUTPUT_TABLE SED_EXPRESSION
# Writes the header to OUTPUT_TABLE, then appends one row per INPUT_GFF line:
# fields 9-15 glued together (cleaned with SED_EXPRESSION) as name, fields 4
# and 5 as start/end, constant strand "1", colour "black", the name repeated
# in the fill column, and gene type "arrows".
write_genoplotr_table() {
  echo -e "$genoplotr_header" > "$2"
  awk -F " " '{OFS="\t"}{print $9 $10 $11 $12 $13 $14 $15,$4,$5,"1","black","arrows"}' "$1" | sed "$3" | awk -F "\t" '{OFS="\t"}{print $1,$2,$3,$4,$5,$1,$6}' >> "$2"
}

###--------------------
write_genoplotr_table Streptococcus_virus_9874_forR.gff Streptococcus_virus_9874_for_genoplotR.gff 's/product=.*;group=//g'

###--------------------
write_genoplotr_table Streptococcus_phage_1_startAligned_Final_forR.gff Streptococcus_phage_1_startAligned_Final_for_genoplotR.gff 's/ID=.*;group=//g'

###--------------------
write_genoplotr_table Streptococcus_phage_2_startAligned_Final_forR.gff Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff 's/ID=.*;group=//g'
cat Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff

###--------------------
write_genoplotr_table Streptococcus_phage_SW30_forR.gff Streptococcus_phage_SW30_for_genoplotR.gff 's/product=.*;group=//g'
library(genoPlotR)
library(plyr)
library(tidyverse)


##-------------------------
## Streptococcus phage SW30: annotation table -> dna_seg
##-------------------------
library(readr)
SW30 <- as.data.frame(
  read_delim(
    "../data_zenodo/non_genomic_data/Streptococcus_phage_SW30_for_genoplotR.gff",
    "\t",
    escape_double = FALSE,
    trim_ws = TRUE
  )
)

# Labels found in the fill column before recoding.
table(SW30$fill)

# Annotation label -> fill colour.
sw30_fill_colours <- c(
  "portalprotein" = "red", "largeterminasesubunit" = "red", "smallterminasesubunit" = "red",
  "majorcapsidprotein" = "green", "head-tailconnectorprotein" = "blue",
  "baseplatecomponentprotein" = "yellow", "tailprotein" = "pink", "majortailprotein" = "pink",
  "tape-measureprotein" = "green", "tailchaperoneprotein" = "green", "distaltailprotein" = "yellow",
  "hypotheticalprotein" = "grey", "holin" = "green", "lysin" = "green", "replication" = "brown",
  "antireceptorprotein" = "blue", "other" = "grey88", "cro-likeprotein" = "violet",
  "HNHendonuclease" = "grey88", "endonuclease" = "grey88", "Protease" = "grey88"
)
SW30$fill <- revalue(SW30$fill, sw30_fill_colours)

# Check the recoded fill colours and the outline colours.
table(SW30$fill)
table(SW30$col)

# Single-genome gene map as a preview.
dna_seg1_SW30 <- dna_seg(SW30)
dna_segs_SW30 <- list(dna_seg1_SW30)
plot_gene_map(dna_segs = dna_segs_SW30)
##-------------------------
## Streptococcus phage 2 (start-aligned): annotation table -> dna_seg
##-------------------------
library(readr)
rmk_2 <- as.data.frame(
  read_delim(
    "../data_zenodo/non_genomic_data/Streptococcus_phage_2_startAligned_Final_for_genoplotR.gff",
    "\t",
    escape_double = FALSE,
    trim_ws = TRUE
  )
)

# Labels found in the fill column before recoding.
table(rmk_2$fill)

# Annotation label -> fill colour.
rmk_2_fill_colours <- c(
  "portalprotein" = "red", "largeterminasesubunit" = "red", "smallterminasesubunit" = "red",
  "majorcapsidprotein" = "green", "minorcapsidprotein" = "green",
  "head-tailconnectorprotein" = "blue", "baseplatecomponentprotein" = "yellow",
  "scaffoldingprotein" = "pink", "tailprotein" = "pink", "majortailprotein" = "pink",
  "tape-measureprotein" = "green", "tailchaperoneprotein" = "green", "distaltailprotein" = "yellow",
  "hypotheticalprotein" = "grey", "holin" = "green", "lysin" = "green",
  "tail-associatedlysin" = "blue", "replication" = "brown", "antireceptorprotein" = "blue",
  "other" = "grey88", "cro-likeprotein" = "violet", "HNHendonuclease" = "grey88",
  "endonuclease" = "grey88", "Protease" = "grey88", "tailcompletionprotein" = "blue",
  "Integrase" = "cyan", "Acr-like" = "gold", "baseplateprotein" = "violet",
  "Head-closureprotein" = "blue", "repressor" = "cyan", "regulator" = "grey88",
  "antirepressor" = "cyan"
)
rmk_2$fill <- revalue(rmk_2$fill, rmk_2_fill_colours)

# Check the recoded fill colours and the outline colours.
table(rmk_2$fill)
table(rmk_2$col)

# Single-genome gene map as a preview.
dna_seg1_rmk_2 <- dna_seg(rmk_2)
dna_segs_rmk_2 <- list(dna_seg1_rmk_2)
plot_gene_map(dna_segs = dna_segs_rmk_2)

##-------------------------
## Streptococcus virus 9874: annotation table -> dna_seg
##-------------------------
library(readr)
phage_9874 <- as.data.frame(
  read_delim(
    "../data_zenodo/non_genomic_data/Streptococcus_virus_9874_for_genoplotR.gff",
    "\t",
    escape_double = FALSE,
    trim_ws = TRUE
  )
)

# Labels found in the fill column before recoding.
table(phage_9874$fill)

# Annotation label -> fill colour.
phage_9874_fill_colours <- c(
  "portalprotein" = "red", "largeterminasesubunit" = "red", "smallterminasesubunit" = "red",
  "majorcapsidprotein" = "green", "minorcapsidprotein" = "green",
  "head-tailconnectorprotein" = "blue", "baseplatecomponentprotein" = "yellow",
  "scaffoldingprotein" = "pink", "tailprotein" = "pink", "majortailprotein" = "pink",
  "tape-measureprotein" = "green", "tailchaperoneprotein" = "green", "distaltailprotein" = "yellow",
  "hypotheticalprotein" = "grey", "holin" = "green", "lysin" = "green",
  "tail-associatedlysin" = "blue", "replication" = "brown", "antireceptorprotein" = "blue",
  "other" = "grey88", "cro-likeprotein" = "violet", "HNHendonuclease" = "grey88",
  "endonuclease" = "grey88", "Protease" = "grey88", "tailcompletionprotein" = "blue",
  "repressor" = "cyan", "regulator" = "grey88", "antirepressor" = "cyan"
)
phage_9874$fill <- revalue(phage_9874$fill, phage_9874_fill_colours)

# Check the recoded fill colours and the outline colours.
table(phage_9874$fill)
table(phage_9874$col)

# Single-genome gene map as a preview.
dna_seg1_9874 <- dna_seg(phage_9874)
dna_segs_9874 <- list(dna_seg1_9874)
plot_gene_map(dna_segs = dna_segs_9874)


##-------------------------
## Streptococcus phage 1 (start-aligned): annotation table -> dna_seg
##-------------------------
rmk_1 <- as.data.frame(
  read_delim(
    "../data_zenodo/non_genomic_data/Streptococcus_phage_1_startAligned_Final_for_genoplotR.gff",
    "\t",
    escape_double = FALSE,
    trim_ws = TRUE
  )
)

# Labels found in the fill column before recoding.
table(rmk_1$fill)

# Annotation label -> fill colour.
rmk_1_fill_colours <- c(
  "portalprotein" = "red", "largeterminasesubunit" = "red", "smallterminasesubunit" = "red",
  "majorcapsidprotein" = "green", "minorcapsidprotein" = "green",
  "head-tailconnectorprotein" = "blue", "baseplatecomponentprotein" = "yellow",
  "scaffoldingprotein" = "pink", "tailprotein" = "pink", "majortailprotein" = "pink",
  "tape-measureprotein" = "green", "tailchaperoneprotein" = "green", "distaltailprotein" = "yellow",
  "hypotheticalprotein" = "grey", "holin" = "green", "lysin" = "green",
  "tail-associatedlysin" = "blue", "replication" = "brown", "antireceptorprotein" = "blue",
  "other" = "grey88", "cro-likeprotein" = "violet", "HNHendonuclease" = "grey88",
  "endonuclease" = "grey88", "Protease" = "grey88", "tailcompletionprotein" = "blue",
  "Integrase" = "cyan", "Acr-like" = "gold", "baseplateprotein" = "violet",
  "Head-closureprotein" = "blue", "repressor" = "cyan", "regulator" = "grey88",
  "antirepressor" = "cyan"
)
rmk_1$fill <- revalue(rmk_1$fill, rmk_1_fill_colours)

# Check the recoded fill colours and the outline colours.
table(rmk_1$fill)
table(rmk_1$col)

# Single-genome gene map as a preview.
dna_seg1_rmk_1 <- dna_seg(rmk_1)
dna_segs_rmk_1 <- list(dna_seg1_rmk_1)
plot_gene_map(dna_segs = dna_segs_rmk_1)

###--------------------
## merge: all four phage segments in one gene map,
## drawn in the order 9874, phage 1, phage 2, SW30
###--------------------
all_toegther_annotation <- list(
  dna_seg1_9874,
  dna_seg1_rmk_1,
  dna_seg1_rmk_2,
  dna_seg1_SW30
)
plot_gene_map(dna_segs = all_toegther_annotation)

With comparison

1.6.2 contig quantification

Here, we quantify not only the bacteria but also the phage abundance by taking the mapping coverage and normalise with the total read abundance.

# source("/home/vincent/Desktop/ScriptRepository/R_functions-master/R_functions/g_legend.R") ##for plotting only legends

##===================================
#-------------file import
# Per-gene read counts for bacteria and phages; the file has no header, so
# the column names are supplied here.
read_count <- read_delim(
  "../data_zenodo/non_genomic_data//all_2bacteria_and_phages_from_MAG.bed",
  "\t",
  escape_double = FALSE,
  col_names = c("chr", "start", "end", "count", "length_mapped", "geneLength", "unknown", "sample"),
  trim_ws = TRUE
)

table(read_count$chr)

table(read_count$chr)

# Coverage per gene: count * 600 / gene length.
# NOTE(review): 600 is presumably the sequenced bases per counted read or
# read pair - confirm.
read_count$geneCoverage <- with(read_count, (count * 600) / geneLength)

library(dplyr)

# Median gene coverage per sample and replicon, restricted to the two
# chromosomes and three phages listed here.
replicons_kept <- c("CP046131", "CP046134", "Lactobacillus_phage_1", "Streptococcus_phage_1", "Streptococcus_phage_2")
median_by_replicon <- dplyr::summarize(
  group_by(read_count, sample, chr),
  median = median(geneCoverage)
)
all_final <- filter(median_by_replicon, chr %in% replicons_kept)

# Sum of the medians per sample, used as denominator for the percentages.
total_samples_sumTreatment <- aggregate(
  . ~ sample,
  data = all_final[, c("sample", "median")],
  sum,
  na.rm = TRUE
)

sample_total_row <- match(all_final$sample, total_samples_sumTreatment$sample)
all_final$total_coverage <- total_samples_sumTreatment[sample_total_row, "median"]
all_final$percent_coverage <- 100 * (all_final$median / all_final$total_coverage)

table(all_final$sample)

# Drop two samples, then fix the sample order for plotting.
all_final <- filter(all_final, !sample %in% c("th_K2_8h", "di_K2_6h"))
sample_order <- c(
  "lyo202_96", "Lyo_202_2012", "Konserve_202", "Lyo_202_2014", "RMK202", "Versand_202",
  "G1_6_18", "G2_6_18", "G3_6_18", "G4_6_18", "G5_6_18"
)
all_final$sample <- factor(all_final$sample, levels = sample_order)

# The same table is written under two file names (tab-separated, no header).
for (coverage_file in c("../03_results//coverage_rmk202.tsv", "../03_results//coverage_rmk202_n32.tsv")) {
  write.table(all_final, coverage_file, sep = "\t", quote = FALSE, col.names = FALSE)
}

 # all_colours <-c("#C5F6FA" ,"#6CF5A3" ,"#36E37B" ,"#10B552", "darkorange",  "#FAA0A0","#EB4D4D")
 all_colours <-c("#C5F6FA" ,"#6CF5A3" ,"#36E37B" ,"#10B552", "darkorange",  "#FAA0A0","#EB4D4D")


# mito_colours <- c("#AFBACC","#BE99AB") # sequential_hcl(5,palette="Grays")[c(4,3)]
# Sterm_colours <- c("#FCC2C2","#EB4D4D")# c("#FAA0A0","#EB4D4D") #c("#FF7D87","#E71D32")    
# Ldel_colours <- c("#6CF5A3","#10B552")  # c("#8CD211","#4C8400")   # c("#5AA700","#2D660A") #brewer.pal(9,name="YlGnBu")[c(4,6)] #sequential_hcl(5,palette="Purples 3")[c(3,2)]
# lactococcus_colours <- c("#5AAAFA","#4178BE")  #sequential_hcl(5,palette="Terrain 2")[c(3,2)]
# rest_colours <-  c("#C5F6FA","#99E9F2","#66D9E8","#3BC9DB") #brewer.pal(9,name="YlOrBr")[c(5,7,8,9)] #sequential_hcl(5,palette="BluGrn")[2:5]
# colours_phages_02 <- c(rest_colours,lactococcus_colours,Ldel_colours,Sterm_colours,mito_colours)



##----------------change name
# plyr is attached first so that dplyr functions of the same name take
# precedence afterwards.
library(plyr)
library(dplyr)

# Replace the raw sample identifiers by the labels used on the plot axis.
all_final$sample <- revalue(
  all_final$sample,
  c(
    "lyo202_96" = "Lyo 1996",
    "Lyo_202_2012" = "Lyo\n2012",
    "Lyo_202_2014" = "Lyo\n2014",
    "Konserve_202" = "working\nstock",
    "RMK202" = "starter\nculture\n2012",
    "Versand_202" = "starter\nculture\n2018"
  )
)
##----------------plot
# all_final <- all_final %>% filter(!sample %in% c("cheesemaking\nday1","cheesemaking\nday2"))

# Show the current level order before it is replaced.
levels(all_final$chr)

# Fix the order of the replicons; this order drives the stacking of the bars
# and the assignment of fill colours.
all_final$chr <- factor(
  all_final$chr,
  levels = c(
    "Lactobacillus_phage_2",
    "Lactobacillus_phage_1",
    "CP046133",
    "CP046132",
    "Streptococcus_phage_2",
    "Streptococcus_phage_1",
    "CP046135",
    "CP046131",
    "CP046134"
  )
)



# all_colours
# Fill palette for the stacked bar chart. Two earlier nine-colour versions of
# this vector were assigned and overwritten before ever being read; only the
# palette that actually reaches the plot is kept.
# NOTE(review): `chr` is declared with nine levels but five colours are given.
# scale_fill_manual() presumably needs one value per level present in the
# data -- confirm that at most five replicons remain in `all_final`.
all_colours_new <- c("#a0cbd2", "#ffa300", "#ff8a00", "#10B552", "#EB4D4D")

# c("#C5F6FA","#6CF5A3","#36E37B", "#10B552","darkorange","#FAA0A0","#EB4D4D")
# c("L_del_phage_01" , "L_del_plasmid_02" ,"L_plasmid_RMK202", "L_del_plasmid_01","S_term_phage_01","S_term_plasmid_01","L_delbrueckii_RMK202","S_thermophilus_RMK202")

# Stacked bars: one bar per sample, one segment per replicon (`chr`), segment
# height taken from `percent_coverage`.
PrelAbundance <- ggplot(
  data = all_final,
  aes(
    y = percent_coverage,
    x = sample,
    group = interaction(chr),
    fill = chr
  )
) +
  geom_bar(stat = "identity") +
  labs("",
       x = "",
       y = "relative abundance") +
  theme_classic() +
  scale_fill_manual(values = all_colours_new) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "right",
    legend.title = element_blank()
  )

# Show the figure in the notebook, then render it once more into the SVG file.
PrelAbundance
svg("../03_results//relative_abundance.svg", width = 4.5, height = 3)
PrelAbundance
dev.off()
# 
 # svg("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.svg",width=7,height=4)
# png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 2000, height = 1000,res=300)
# 
# PrelAbundance
# 
# dev.off()

##----------------amount phages


# all_final$genus <- ifelse(grepl("S_",all_final$species),"CP046131","Lactobacillus")

# Keep only the rows of the four phage replicons.
all_final_phage <- all_final %>%
  filter(
    chr %in% c(
      "Lactobacillus_phage_1",
      "Lactobacillus_phage_2",
      "Streptococcus_phage_1",
      "Streptococcus_phage_2"
    )
  )

# Summed phage share of `percent_coverage` per sample.
all_final_phage %>%
  group_by(sample) %>%
  dplyr::summarize(sum = sum(percent_coverage))

# Copy of the median coverage under a name that does not clash with the
# median() function, then mean/sd of it over the Lactobacillus_phage_1 rows.
all_final_phage$median_01 <- all_final_phage$median
all_final_phage %>%
  filter(chr == "Lactobacillus_phage_1") %>%
  group_by(chr) %>%
  dplyr::summarize(mean = mean(median_01), sd = sd(median_01))

# Mean/sd over samples of the per-sample summed phage median coverage.
all_final_phage %>%
  group_by(sample) %>%
  dplyr::summarize(sum = sum(median)) %>%
  dplyr::summarize(mean = mean(sum), sd = sd(sum))

# Same, for the per-sample summed phage share of `percent_coverage`.
all_final_phage %>%
  group_by(sample) %>%
  dplyr::summarize(sum = sum(percent_coverage)) %>%
  dplyr::summarize(mean = mean(sum), sd = sd(sum))

# Per-sample total of the phage median coverage ...
total_samples_phages <- aggregate(
  median ~ sample,
  data = all_final_phage[, c("sample", "median")],
  FUN = sum,
  na.rm = TRUE
)

# ... attached to every row, so each phage can be expressed as a percentage
# of all phage coverage in its sample.
all_final_phage$total_coverage_phages <-
  total_samples_phages$median[
    match(all_final_phage$sample, total_samples_phages$sample)
  ]
all_final_phage$percent_coverage_phages <-
  100 * (all_final_phage$median / all_final_phage$total_coverage_phages)

# Only the Streptococcus phage names contain a capital "S".
all_final_phage$genus <- ifelse(
  grepl("S", all_final_phage$chr),
  "Streptococcus_phages",
  "Lactobacillus_phages"
)

# Share of each phage among all phages, per sample.
all_final_phage %>%
  group_by(sample, chr) %>%
  dplyr::summarize(sum = sum(percent_coverage_phages))

# The same table restricted to Lactobacillus_phage_1.
all_final_phage %>%
  group_by(sample, chr) %>%
  dplyr::summarize(sum = sum(percent_coverage_phages)) %>%
  filter(chr == "Lactobacillus_phage_1")

# Number of rows per phage.
table(all_final_phage$chr)

# Share of each phage genus among all phages, per sample.
all_final_phage %>%
  group_by(sample, genus) %>%
  dplyr::summarize(sum = sum(percent_coverage_phages))

##----------------amount of Streptococci and Lactobacilli


# Coarse genus label: species names containing "S_" are Streptococcus, all
# others Lactobacillus. The Streptococcus branch used to be labelled
# "CP046131", which is the accession this file assigns to the 202-LMAG
# (Lactobacillus) genome, so the label contradicted the rows it was put on.
all_final$genus <- ifelse(
  grepl("S_", all_final$species),
  "Streptococcus",
  "Lactobacillus"
)

# Keep only the two bacterial chromosomes (CP046131 and CP046134).
all_final_bacteria <- all_final %>%
  filter(chr %in% c("CP046131", "CP046134"))

# Range and median, over samples, of the relative abundance of each
# chromosome.
all_final_bacteria %>%
  group_by(sample, chr) %>%
  dplyr::summarize(sum = sum(percent_coverage)) %>%
  group_by(chr) %>%
  dplyr::summarize(
    min = min(sum),
    max = max(sum),
    median = median(sum)
  )

##----------------amount Ldelplasmid


# all_final$genus <- ifelse(grepl("S_",all_final$species),"CP046131","Lactobacillus")

# Rows of replicon CP046132 only (the section header calls it the Ldel
# plasmid).
all_final_plasmid <- all_final %>%
  filter(chr %in% "CP046132")

# Lowest and highest per-sample relative abundance of that replicon.
all_final_plasmid %>%
  group_by(sample, chr) %>%
  dplyr::summarize(sum = sum(percent_coverage)) %>%
  group_by(chr) %>%
  dplyr::summarize(
    min = min(sum),
    max = max(sum)
  )

##----------------amount of Lactobacillus delbrueckii RMK202

# Lowest and highest relative abundance of each bacterial chromosome.
all_final_bacteria %>%
  group_by(chr) %>%
  dplyr::summarize(
    min = min(percent_coverage),
    max = max(percent_coverage)
  )

##----------------coverage of Lactobacillus delbrueckii RMK202
# Row counts per species label, then the rows whose species is "CP046131".
table(all_final_bacteria$species)

all_final_bacteria[which(all_final_bacteria$species == "CP046131"), ]


##-------------------------------plasmid copy number


# One row per sample, with the median coverage of CP046132 and CP046131 side
# by side.
all_final_copyNUMBER <- all_final_all %>%
  filter(chr %in% c("CP046132", "CP046131")) %>%
  select(sample, chr, median) %>%
  spread(chr, median)

# Coverage of CP046132 as a percentage of the coverage of CP046131.
all_final_copyNUMBER$copyNUMBER <- with(
  all_final_copyNUMBER,
  100 * (CP046132 / CP046131)
)

# One point per sample; the y axis is log2-transformed.
plasmidCopyNUMber <- ggplot(
  all_final_copyNUMBER,
  aes(x = sample, y = copyNUMBER)
) +
  geom_point() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  coord_trans(y = "log2") +
  labs(x = "", y = "plasmid copy number\n[%]")

plasmidCopyNUMber

# Same figure written to disk.
svg("../03_results//plasmidCopyNUMBER.svg", width = 4.5, height = 3)
plasmidCopyNUMber
dev.off()

# Mean and sd over samples, once as a table and once as bare numbers.
all_final_copyNUMBER %>%
  dplyr::summarize(sum = mean(copyNUMBER), sd = sd(copyNUMBER))

with(all_final_copyNUMBER, mean(copyNUMBER))
with(all_final_copyNUMBER, sd(copyNUMBER))




##----------------copy number of phages

# Wide overview: one row per sample, one column of median coverage per
# replicon.
all_final %>%
  select(c("sample", "chr", "median")) %>%
  spread(chr, median)

# Median coverage of CP046134 and of the two Streptococcus phages per sample.
all_final_copyNUMBER_phage <- all_final %>%
  select(c("sample", "chr", "median")) %>%
  filter(
    chr %in% c("CP046134", "Streptococcus_phage_1", "Streptococcus_phage_2")
  ) %>%
  spread(chr, median)

# Both Streptococcus phages combined ...
all_final_copyNUMBER_phage$Streptococcus_phages <- with(
  all_final_copyNUMBER_phage,
  Streptococcus_phage_2 + Streptococcus_phage_1
)

# ... expressed as a percentage of the CP046134 coverage.
all_final_copyNUMBER_phage$copyNUMBER <- with(
  all_final_copyNUMBER_phage,
  100 * (Streptococcus_phages / CP046134)
)

# One point per sample; the y axis is log2-transformed.
phageCopyNUMber <- ggplot(
  all_final_copyNUMBER_phage,
  aes(x = sample, y = copyNUMBER)
) +
  geom_point() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  coord_trans(y = "log2") +
  labs(x = "", y = "Sterm phage copy number\n[%]")

phageCopyNUMber

# Same figure written to disk.
svg("../03_results//phageCopyNUMBER.svg", width = 4.5, height = 3)
phageCopyNUMber
dev.off()

# Mean and sd over samples, once as a table and once as bare numbers.
all_final_copyNUMBER_phage %>%
  dplyr::summarize(sum = mean(copyNUMBER), sd = sd(copyNUMBER))

with(all_final_copyNUMBER_phage, mean(copyNUMBER))
with(all_final_copyNUMBER_phage, sd(copyNUMBER))

# Extremes as a ratio (percentage divided by 100).
with(all_final_copyNUMBER_phage, min(copyNUMBER / 100))
with(all_final_copyNUMBER_phage, max(copyNUMBER / 100))

##----------------copy number of lacto phages

# Wide overview: one row per sample, one column of median coverage per
# replicon.
all_final %>%
  select(c("sample", "chr", "median")) %>%
  spread(., chr, median)

# Median coverage of CP046131 and of Lactobacillus_phage_1 per sample; the
# "working stock" sample is excluded.
all_final_copyNUMBER_phage_LACTO <- all_final %>%
  select(c("sample", "chr", "median")) %>%
  filter(chr %in% c("CP046131", "Lactobacillus_phage_1")) %>%
  spread(., chr, median) %>%
  filter(sample != "working\nstock")

# Phage coverage as a percentage of the CP046131 coverage. The factor 100
# was missing here, although the axis is labelled [%] and the summaries
# below divide by 100 to get back to a ratio (as the Streptococcus phage
# section does).
all_final_copyNUMBER_phage_LACTO$copyNUMBER <- 100 * (
  all_final_copyNUMBER_phage_LACTO$Lactobacillus_phage_1 /
    all_final_copyNUMBER_phage_LACTO$CP046131
)

# One point per sample; the y axis is log2-transformed. The axis title used
# to read "Sterm phage", copied from the Streptococcus plot.
phage_lactoCopyNUMber <- ggplot(
  all_final_copyNUMBER_phage_LACTO,
  aes(x = sample, y = copyNUMBER)
) +
  geom_point() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  coord_trans(y = "log2") +
  labs(x = "", y = "Ldel phage copy number\n[%]")

phage_lactoCopyNUMber

# Same figure written to disk.
svg("../03_results//phageCopyNUMBER_lacto.svg", width = 4.5, height = 3)
phage_lactoCopyNUMber
dev.off()

# Mean and sd over samples in percent, as a table and as bare numbers.
all_final_copyNUMBER_phage_LACTO %>%
  dplyr::summarize(sum = mean(copyNUMBER), sd = sd(copyNUMBER))

mean(all_final_copyNUMBER_phage_LACTO$copyNUMBER)
sd(all_final_copyNUMBER_phage_LACTO$copyNUMBER)

# The same statistics as a ratio (percentage divided by 100).
min(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
max(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)

mean(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
sd(all_final_copyNUMBER_phage_LACTO$copyNUMBER / 100)
##----------------------normalise with actual mapping percent--------------------------------

# library(readr)
# 
# mappings <- read_delim("~/Desktop/Projects/2019_Pilotplan/02_mapping2referenceDB/mappings_final.txt",  "\t", escape_double = FALSE, col_names = c("sample","type","mapping"),  trim_ws = TRUE) %>% filter(type=="onlyMeta") %>% select(add=-type)
# mappings$sample <- revalue(mappings$sample, c("L71"="mst4","lyo202_96"="Lyo 1996","Lyo_202_2012"="Lyo\n2012", "Lyo_202_2014"="Lyo\n2014", "Konserve_202"="working\nstock", "RMK202"="starter\nculture\n2012", "Versand_202"="starter\nculture\n2018","di_K2_6h"="cheesemaking\nday1","th_K2_8h"="cheesemaking\nday2"))
# table(all_final$sample)
# table(mappings$sample)
# 
# all_final_normalised <-  merge(all_final,mappings,by.x ="sample",by.y="sample" )
# table(all_final_normalised$sample)
# 
# revalue(all_final_normalised$chr,c()
# 
# table(all_final_normalised$sample)
# all_final_normalised$percent_coverage_normalised <- all_final_normalised$percent_coverage*((all_final_normalised$mapping/100))
# PrelAbundance <-  ggplot( data = all_final_normalised,aes(y = percent_coverage_normalised, x = sample, group=interaction(chr),fill = chr))+ geom_bar( stat="identity")+
#     labs("",
#          x="",
#          y="relative abundance")+
#     theme_classic()+
#   geom_hline(yintercept=100, linetype="dashed", color = "grey")+
#    # scale_color_viridis(discrete=TRUE)+
#    scale_fill_manual(values=all_colours_new)+
#   # scale_fill_viridis(discrete=TRUE)+
#     theme(axis.text.x = element_text(angle = 75, hjust = 1,size=9),
#         # axis.text.x = element_blank(),
#           legend.position="right",
#           #legend.justification=c(1,1), legend.position=c(1,1),
#           legend.title = element_blank()
#           )
# 
#   PrelAbundance
#   svg("~/Desktop/mid_thesis/report/figures/20200430/chapter1/relative_abundance.svg",width=4.5,height=3)
#    # png("~/Desktop/Manuscripts/2019_RMK202/Figures/F1_relativeAbundance.png", width = 1000, height = 800,res=300)
# 
# PrelAbundance
# 
# dev.off()
# # 

##===============================
##only Bacteria
##===============================

# Show the bacteria-only table.
all_final_bacteria

# Per-sample sum of the median coverage of the two chromosomes ...
total_samples_sumTreatment <- aggregate(
  median ~ sample,
  data = all_final_bacteria[, c("sample", "median")],
  FUN = sum,
  na.rm = TRUE
)

# ... attached to every row; `percent_coverage` is then recomputed so that
# the two chromosomes of a sample add up to 100.
all_final_bacteria$total_coverage <-
  total_samples_sumTreatment$median[
    match(all_final_bacteria$sample, total_samples_sumTreatment$sample)
  ]
all_final_bacteria$percent_coverage <-
  100 * (all_final_bacteria$median / all_final_bacteria$total_coverage)

# Two fill colours, one per chromosome. This overwrites the palette used for
# the full plot; `PrelAbundance` was built before this point.
all_colours_new <- c("#10B552", "#EB4D4D")

# Stacked bars: one bar per sample, one segment per chromosome.
PrelAbundance_bacteria <- ggplot(
  data = all_final_bacteria,
  aes(
    y = percent_coverage,
    x = sample,
    group = interaction(chr),
    fill = chr
  )
) +
  geom_bar(stat = "identity") +
  labs("",
       x = "",
       y = "relative abundance") +
  theme_classic() +
  scale_fill_manual(values = all_colours_new) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
    legend.position = "right",
    legend.title = element_blank()
  )

library(patchwork)

PrelAbundance_bacteria

# Three panels side by side: all replicons without legend, bacteria only
# without legend, and all replicons once more with its legend.
svg("../03_results//relative_abundance_all.svg", width = 10, height = 4.5)
(PrelAbundance + theme(legend.position = "none")) +
  (PrelAbundance_bacteria + theme(legend.position = "none")) +
  PrelAbundance
dev.off()


###----------------------low copy plasmids

# Median gene coverage per sample and replicon, restricted to the two
# chromosomes (CP046131, CP046134) and the three plasmids (CP046132,
# CP046133, CP046135).
all_final_allsamll <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  filter(chr %in% c("CP046131", "CP046134", "CP046132", "CP046133", "CP046135"))

# One row per sample, one coverage column per replicon.
# NOTE(review): this filter uses the relabelled name "working\nstock" while
# the lookup further down uses the raw name "RMK202"; `read_count` can only
# carry one of the two naming schemes, so one of these filters matches
# nothing -- confirm which names `read_count$sample` holds.
all_final_copyNUMBER_phage_low <- all_final_allsamll %>%
  select(c("sample", "chr", "median")) %>%
  spread(., chr, median) %>%
  filter(sample != "working\nstock")

# Plasmid coverage relative to the chromosome it is normalised to.
all_final_copyNUMBER_phage_low$copyNUMBER_2 <- (
  all_final_copyNUMBER_phage_low$CP046133 /
    all_final_copyNUMBER_phage_low$CP046131
)
all_final_copyNUMBER_phage_low$copyNUMBER_3 <- (
  all_final_copyNUMBER_phage_low$CP046135 /
    all_final_copyNUMBER_phage_low$CP046134
)
# CP046132 was divided by CP046134 here, although the plasmid copy-number
# section of this file normalises the same plasmid to CP046131. The
# denominator now agrees with that section.
all_final_copyNUMBER_phage_low$copyNUMBER_1 <- (
  all_final_copyNUMBER_phage_low$CP046132 /
    all_final_copyNUMBER_phage_low$CP046131
)

all_final_copyNUMBER_phage_low %>% filter(sample == "RMK202")

# Mean and sd of the three ratios over samples.
mean(all_final_copyNUMBER_phage_low$copyNUMBER_2)
mean(all_final_copyNUMBER_phage_low$copyNUMBER_3)
sd(all_final_copyNUMBER_phage_low$copyNUMBER_2)
sd(all_final_copyNUMBER_phage_low$copyNUMBER_3)

mean(all_final_copyNUMBER_phage_low$copyNUMBER_1)
sd(all_final_copyNUMBER_phage_low$copyNUMBER_1)

# Summary of the Lactobacillus phage copy number once more: mean and sd as a
# table, then as bare numbers.
all_final_copyNUMBER_phage_LACTO %>%
  dplyr::summarize(
    sum = mean(copyNUMBER),
    sd = sd(copyNUMBER)
  )

with(all_final_copyNUMBER_phage_LACTO, mean(copyNUMBER))
with(all_final_copyNUMBER_phage_LACTO, sd(copyNUMBER))

# The same column divided by 100: extremes, then mean and sd.
with(all_final_copyNUMBER_phage_LACTO, min(copyNUMBER / 100))
with(all_final_copyNUMBER_phage_LACTO, max(copyNUMBER / 100))

with(all_final_copyNUMBER_phage_LACTO, mean(copyNUMBER / 100))
with(all_final_copyNUMBER_phage_LACTO, sd(copyNUMBER / 100))

1.6.3 Identify active Prophage reads

I try to identify active prophage reads by looking for paired-end reads whose mates map to different contigs. The relevant columns of the SAM file are the following:

Column 7: RNEXT, String, `\*|=|[:rname:^*=][:rname:]*`, reference name of the mate/next read. Column 8: PNEXT, Int, [0, 2^31 − 1], position of the mate/next read.

  1. RNEXT: Reference sequence name of the primary alignment of the NEXT read in the template. For the last read, the next read is the first read in the template. If @SQ header lines are present, RNEXT (if not ‘*’ or ‘=’) must be present in one of the SQ-SN tags. This field is set as ‘*’ when the information is unavailable, and set as ‘=’ if RNEXT is identical to RNAME. If not ‘=’ and the next read in the template has one primary mapping (see also bit 0x100 in FLAG), this field is identical to RNAME at the primary line of the next read. If RNEXT is ‘*’, no assumptions can be made on PNEXT and bit 0x20.

  2. PNEXT: 1-based Position of the primary alignment of the NEXT read in the template. Set as 0 when the information is unavailable. This field equals POS at the primary line of the next read. If PNEXT is 0, no assumptions can be made on RNEXT and bit 0x20.

Obviously, I need to first extract all reads that map to the phages.

First, mask the CRISPR arrays and append the lineage genomes to the MAGs.

## mask genomes


species=Sterm

# Start from a clean combined FASTA.
# NOTE(review): BaseLocation is only assigned further down in this file; it
# must already be set in the shell when this chunk runs.
combinedMasked=${BaseLocation}/CRISPRspacerBLAST/blast/all_Genomes_masked.fna
rm ${combinedMasked}

for genomes in 202-SMAG 202-S50 202-S72
do
  echo $genomes

  prokkaDir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/${species}/${genomes}
  repeatBed=${prokkaDir}/CRISPRrepeat_${genomes}.bed
  maskedFna=/archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes/strains/${species}/renamedContigs/StartAligned/StartAligned//${genomes}_CRISPRmasked.fna

  # GFF lines mentioning "repeat" -> contig, start, end.
  grep "repeat" ${prokkaDir}/PROKKA_*.gff | cut -f 1,4,5 > ${repeatBed}

  # Mask the repeat intervals in the FASTA.
  # NOTE(review): the input FASTA is the same fixed file (S_O_202_13496.fna)
  # for every genome of the loop; only the BED file changes -- confirm that
  # this is intended.
  bedtools maskfasta \
    -fi /archiv/Projects/2019_RMK202_analysis/00_FINAL/03_annotation/PROKKA/Sterm/FNA_all/S_O_202_13496.fna \
    -bed ${repeatBed} \
    -fo ${maskedFna}

  # Collect all masked genomes in one file.
  cat ${maskedFna} >> ${combinedMasked}

done

 

##change name


# Build one reference FASTA holding the masked MAG assembly plus one genome
# per Streptococcus lineage, each with a recognisable contig name.
lineageDir=/work/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_one_of_each_line/CRISPR_masked_genomes/
allLineages=${lineageDir}/For_all_lineages_one_genome.fasta

# Lineage 1: the MAG assembly, with CP046134 renamed. This call creates
# (overwrites) the file.
sed 's/CP046134/S_genome_lineage1/g' \
  /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/maskedReferenceMapping/CRISPRmasked_RMK202_MAG_assembly.fasta \
  > ${allLineages}

# Lineages 4, 3 and 2: appended in this order.
sed 's/contig_1/S_genome_lineage4/g' ${lineageDir}/S_O_202_13496*.fna >> ${allLineages}

sed 's/S_O_202_24740_c1/S_genome_lineage3/g' ${lineageDir}/S_O_202_24740*.fna >> ${allLineages}

sed 's/contig_1/S_genome_lineage2/g' ${lineageDir}/S_O_202_13494*.fna >> ${allLineages}

# List the sequence headers of the result.
grep ">" ${allLineages}




###================
##description file
###================
#samplesss=/archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt  ##the file with all sample names
# Sample list; the loops below read the sample names from its first column.
samplesss="/archiv/Projects/2019_RMK202_analysis/01_log/onlyMeta_samples_withEvolution.txt"

# Threads passed to bwa mem (-t) and samtools sort (-@).
threads="37"
# Directory for log files (created before the mapping loop).
logFilelocation="/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/maskedReferenceMapping//01_log"
# Root under which each sample gets its own mapping directory.
BaseLocation="/work/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_one_of_each_line/maskedReferenceMapping/"
# Reference FASTA used for pilercr, bwa index and bwa mem.
Assembly="/work/Projects/2019_RMK202_analysis/03_mapping2Nanopre/02_againstpolished_one_of_each_line/CRISPR_masked_genomes//For_all_lineages_one_genome.fasta"
# Sample name used when a chunk is run outside the loops.
names="G4_6_18"

##--------------------------------------------------------------------------------------------------------------------------------------
##-----------make complete reference with CRISPR masked genomes
##--------------------------------------------------------------------------------------------------------------------------------------

# Rebuild the CRISPR-masked reference from scratch.
maskedRefDir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/maskedReferenceMapping/
maskedRef=${maskedRefDir}CRISPRmasked_RMK202_MAG_assembly.fasta
magFasta=/archiv/Projects/2019_Nano_Meta/02_flye_Assembly/02_raw_demultiplexed/di_K2_6h/Polishing_FINAL/FINAL_assembly_withPlasmids_PGAP/RMK202_MAG_assembly.fasta

rm -r ${maskedRefDir}
mkdir -p ${maskedRefDir}

# Masked genomes, contigs renamed to the accessions: 202-SMAG-1 -> CP046134
# (creates the file), 202-LMAG-1 -> CP046131 (appended).
sed 's/202-SMAG-1/CP046134/g' \
  /archiv/Projects/2019_RMK202_analysis/00_FINAL/02_CRISPR/CRISPR_region_tree/Sterm/maskedGenomes/202-SMAG_CRISPRmasked_extended.fna \
  > ${maskedRef}
sed 's/202-LMAG-1/CP046131/g' \
  /archiv/Projects/2019_RMK202_analysis/00_FINAL/01_genomes/strains/Ldel/renamedContigs/StartAligned/StartAligned//202-LMAG_CRISPRmasked_extended.fna \
  >> ${maskedRef}

# The two start-aligned Streptococcus phage genomes.
cat /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned/Streptococcus_phage_1_startAligned_Final.fasta >> ${maskedRef}
cat /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/startaligned/Streptococcus_phage_2_startAligned_Final.fasta >> ${maskedRef}

# Every other sequence of the original MAG assembly (all headers except the
# two genomes and the Streptococcus phages added above) is copied unchanged.
for remainsss in $(grep ">" ${magFasta} | sed 's/>//g' | grep -v -e "CP046134" -e "CP046131" -e "Streptococcus_phage_")
do
  samtools faidx ${magFasta} ${remainsss} >> ${maskedRef}
done

###================
##description file
###================

# Run PilerCR on the reference; report goes to <Assembly>_pilarTest and the
# sequences to <Assembly>_pilarTest.fasta.
~/apps/PilerCR/pilercr1.06/pilercr \
  -in ${Assembly} \
  -out ${Assembly}_pilarTest \
  -noinfo \
  -seq ${Assembly}_pilarTest.fasta

##============
##mapping to reference
##============
# bwa index for the reference used by the mapping loop.
bwa index $Assembly

mkdir -p ${logFilelocation}

# Progress counter printed by the mapping loop.
num=1

# Map the paired-end reads of every sample listed in ${samplesss} against
# ${Assembly}, write a coordinate-sorted BAM per sample, and extract the
# alignments whose mate is not on the same reference (RNEXT, column 7, is
# not "=").
#
# Changes against the previous version:
#  * "names=G4_6_18" was reassigned unconditionally inside the loop, so every
#    iteration re-mapped G4_6_18 and only the G4_6_18 read file was written.
#    The G4 input location is now chosen only when the current sample is
#    G4_6_18.
#    NOTE(review): this assumes G4_6_18 is listed in ${samplesss} -- confirm.
#  * The SAM header was appended once per sample; it is now written once,
#    from the first BAM (all BAMs are mapped against the same ${Assembly}).
activePhageDir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
headerWritten=0

for names in $(cut -f 1 ${samplesss})
  do
  echo ${num}"/16  :" ${names}
  num=$((num+1))
  # -f: a sample that was never mapped before has no directory yet.
  rm -rf ${BaseLocation}/${names}

  mkdir -p ${BaseLocation}/${names}/bwaMapping2DB/

  sortedBam=${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam

  if [ "${names}" = "G4_6_18" ]; then
    ##-------------special for G4: reads live in the Trim Galore output
    bwa mem -t ${threads} ${Assembly} \
      /home/vincent/Projects/2020_StarterEvolution/01_data/20200929_Novogene/X204SC20090774-Z01-F001/trimm_Galore/gz/${names}/${names}-R1_val_1.fq /home/vincent/Projects/2020_StarterEvolution/01_data/20200929_Novogene/X204SC20090774-Z01-F001/trimm_Galore/gz/${names}/${names}-R2_val_2.fq | samtools sort -@${threads} -O BAM -o ${sortedBam} -
  else
    bwa mem -t ${threads} ${Assembly} \
      /archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/${names}/${names}/${names}*neaddata_paired_1.fastq /archiv/Projects/2019_pilotplant/01_rawData/02_mergedfastq/kneaddata/${names}/${names}/${names}*neaddata_paired_2.fastq | samtools sort -@${threads} -O BAM -o ${sortedBam} -
  fi

  # Keep alignments whose mate reference (column 7) differs from "=".
  samtools view ${sortedBam} | awk -F "\t" '{OFS="\t"}{if($7!="=")print $0}' > ${activePhageDir}/${names}_reads_2_mappings.sam

  # Header for the merged SAM, written a single time.
  if [ ${headerWritten} -eq 0 ]; then
    samtools view -H ${sortedBam} > ${activePhageDir}/all_reads_2_mappings.sam
    headerWritten=1
  fi
#rm ${BaseLocation}/${names}/bwaMapping2DB/${names}_rawIllumina_bwamem_sorted.bam &
   done



# Merge the per-sample discordant-read files behind the collected header and
# convert the result to a sorted BAM.
phageMapDir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
mergedSam=${phageMapDir}/all_reads_2_mappings_all.sam

# The header file becomes the start of the merged file ...
mv ${phageMapDir}/all_reads_2_mappings.sam ${mergedSam}

# ... followed by the alignment lines of every sample.
cat ${phageMapDir}/*_reads_2_mappings.sam >> ${mergedSam}

samtools sort ${mergedSam} \
  | samtools view -S -b - \
  > ${phageMapDir}/all_reads_2_mappings_all.bam

##--------------------------------------------------------------------------------------------------------------------------------------
##-----------extract reads that map to two different contigs
##--------------------------------------------------------------------------------------------------------------------------------------



#mkdir -p /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
# Per sample: count the discordant alignments that mention "phage", then keep
# only those that pass the clipping filters and count them again.
for names in $(cut -f 1 ${samplesss})
do
  echo "----------------------"
  echo ${names}

  discordantSam=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings.sam
  polishedSam=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings_polished.sam

  # Number of lines mentioning "phage" before filtering.
  grep "phage" ${discordantSam} | wc -l

  # Filters, all of which must hold:
  #   column 12 is NM:i:0 or NM:i:1,
  #   column 6 (CIGAR) contains S or H,
  #   column 14 contains S or H
  #     (NOTE(review): presumably the MC:Z mate-CIGAR tag -- confirm the tag
  #      order of the bwa version used),
  # and lines with an H..M..H or S..M..S pattern (clipped on both sides of
  # one match) are dropped.
  grep "phage" ${discordantSam} \
    | awk -F "\t" '($12 == "NM:i:0" || $12 == "NM:i:1") && $6 ~ /[SH]/ && $14 ~ /[SH]/' \
    | grep -v -e "H[0-9]\{1,3\}M[0-9]\{1,3\}H" -e "S[0-9]\{1,3\}M[0-9]\{1,3\}S" \
    > ${polishedSam}

  # Number of lines left after filtering.
  wc -l < ${polishedSam}

done



##--------------------------------------------------------------------------------------------------------------------------------------
##-----------extract reads that map to two different contigs
##--------------------------------------------------------------------------------------------------------------------------------------



mkdir -p /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
# Per sample: count, among the filtered discordant reads, how many lines
# mention a phage together with a host genome. Host genomes are recognised by
# name: "CP046131" for the Lactobacillus genome and "S_gen" for the
# S_genome_lineageN names given to the Streptococcus genomes in the combined
# reference.
# The last count used the pattern "S.gen", where "." matches any character;
# it now uses the literal "S_gen" like the parallel Streptococcus-phage count.
for names in $(cut -f 1 ${samplesss} )
do
polishedSam=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings_polished.sam

echo ${names}
echo "-----------------"
echo "number of phage reads:"
grep -c "phage" ${polishedSam}
echo "S. phage and lactobacillus genome reads:"
grep "Streptococcus_phage" ${polishedSam} |grep -c "CP046131"
echo "S. phage and Streptococcus genome reads:"
grep "Streptococcus_phage" ${polishedSam} |grep -c "S_gen"
echo "L. phage and lactobacillus genome reads:"
grep "Lactobacillus_phage" ${polishedSam} |grep -c "CP046131"
echo "L. phageand Streptococcus genome reads:"
grep "Lactobacillus_phage" ${polishedSam} |grep -c "S_gen"

#grep  "phage" /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings.sam |cut -f 7 |sort |uniq -c
#grep  "phage" /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings.sam |cut -f 3 |sort |uniq -c

echo "=================================================================="
done



##--------------------------------------------------------------------------------------------------------------------------------------
##-----------create circos file for mapping 
##--------------------------------------------------------------------------------------------------------------------------------------
# For every sample listed in column 1 of ${samplesss}, collect the SAM records
# that have one mate on a "_phage_" contig and MAPQ > 15, and append them to
#   allGenomes_mapping_2_genomes.txt : phage contig, start, end, mate contig,
#                                      start, end, sample, MAPQ
#   allGenomes_mapping_2_genomes.sam : the unmodified SAM records
# End coordinates are written as start + length of SAM column 10 (the read
# sequence); the same read length is used for both mates.

phage_map_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
loc_dir="${phage_map_dir}/mappingLocation"

#awk -F "\t" -v samplesss="$names" '{OFS="\t"}{if($5>1 && $5<7) print $0}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings_polished.sam 


# Start from an empty output directory because everything below appends (>>).
# -f: do not fail on the first run, when the directory does not exist yet.
rm -rf "${loc_dir}/"
mkdir -p "${loc_dir}/"
#for names in $(cat /archiv/Projects/2019_RMK202_analysis/01_log/names_extended.txt)
for names in $(cut -f 1 "${samplesss}")
do
echo ${names}
echo "-----------------"

sam_in="${phage_map_dir}/${names}_reads_2_mappings_polished.sam"

# Earlier variants without the MAPQ filter (kept for reference):
#awk -F "\t" -v samplesss="$names" '{OFS="\t"}{if($3~"_phage_")print $3,$4,$4+length($10),$7,$8,$8+length($10),samplesss}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings_polished.sam  >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/mappingLocation/allGenomes_mapping_2_genomes.txt
#awk -F "\t" -v samplesss="$names" '{OFS="\t"}{if($7~"_phage_")print $7,$8,$8+length($10),$3,$4,$4+length($10),samplesss}' /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/${names}_reads_2_mappings_polished.sam >> /archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/mappingLocation/allGenomes_mapping_2_genomes.txt

##-------------------no multimapping
##I think mapq>15 is the cutoff for only single mappings
# Record itself maps to a phage contig (column 3): phage columns first, mate second.
awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"} $3~"_phage_" && $5>15 {print $3,$4,$4+length($10),$7,$8,$8+length($10),sample,$5}' "${sam_in}" >> "${loc_dir}/allGenomes_mapping_2_genomes.txt"

# Mate maps to a phage contig (column 7): swap so the phage columns still come first.
awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"} $7~"_phage_" && $5>15 {print $7,$8,$8+length($10),$3,$4,$4+length($10),sample,$5}' "${sam_in}" >> "${loc_dir}/allGenomes_mapping_2_genomes.txt"

# Same two selections, but keeping the full SAM record.
awk -F "\t" '$3~"_phage_" && $5>15' "${sam_in}" >> "${loc_dir}/allGenomes_mapping_2_genomes.sam"

awk -F "\t" '$7~"_phage_" && $5>15' "${sam_in}" >> "${loc_dir}/allGenomes_mapping_2_genomes.sam"

echo "=================================================================="
done
wc -l "${loc_dir}/allGenomes_mapping_2_genomes.txt"



# Keep only rows whose mate contig (column 4) is not itself a "_phage_" contig
# and whose sample column (7) is non-empty, then print row counts per file,
# per sample (column 7) and per mate contig (column 4).
loc_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/mappingLocation
pairs_all=${loc_dir}/allGenomes_mapping_2_genomes.txt
pairs_clean=${loc_dir}/allGenomes_mapping_2_genomes_cleaned.txt

awk -F "\t" '$4!~"_phage_" && $7!=""' "${pairs_all}" > "${pairs_clean}"
wc -l "${pairs_clean}"

for pairs_file in "${pairs_all}" "${pairs_clean}"
do
cut -f 7 "${pairs_file}" | sort | uniq -c
done

cut -f 4 "${pairs_clean}" | sort | uniq -c

# SAM records that mention "S_gen" anywhere on the line go to a separate file.
grep "S_gen" "${loc_dir}/allGenomes_mapping_2_genomes.sam" > "${loc_dir}/allGenomes_mapping_2_genomes_Sterm.sam"


###--------------------------make bedfile for coverage
# For every sample, write both mates of each phage-associated read pair as
# 4-column BED rows (contig, start, start + read length, sample) and append
# them to allGenomes_mapping_2_genomes.bed. No MAPQ filter is applied here.
#
# FIX: the second awk used to append 7-column rows to
# allGenomes_mapping_2_genomes.txt (a copy of the table-building step), which
# added unfiltered rows to that table after the cleaned table had already been
# derived from it and left the mate-on-phage records out of the BED file. It
# now writes BED rows to the .bed file like the first awk.
# NOTE(review): confirm that the mate-on-phage records are wanted in the BED.

phage_map_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping
pairs_bed="${phage_map_dir}/mappingLocation/allGenomes_mapping_2_genomes.bed"

for names in $(cut -f 1 "${samplesss}")
do
echo ${names}
echo "-----------------"

sam_in="${phage_map_dir}/${names}_reads_2_mappings_polished.sam"

# Record maps to a phage contig (column 3): phage interval, then mate interval.
awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"} $3~"_phage_" {print $3,$4,$4+length($10),sample"\n"$7,$8,$8+length($10),sample}' "${sam_in}" >> "${pairs_bed}"

# Mate maps to a phage contig (column 7): phage interval, then this read's interval.
awk -F "\t" -v sample="$names" 'BEGIN{OFS="\t"} $7~"_phage_" {print $7,$8,$8+length($10),sample"\n"$3,$4,$4+length($10),sample}' "${sam_in}" >> "${pairs_bed}"

echo "=================================================================="
done

# Per-sample, per-base coverage of the read pairs in the cleaned mapping table.
# For each sample three BED files are (re)written from the cleaned table:
#   StermPhages              - rows whose phage contig matches "Streptococcus_phage"
#   StermPhages_bothtoegther - same rows, phage contig relabelled "Streptococcus_phage_1"
#   LdelPhages               - rows whose phage contig is exactly "Lactobacillus_phage_1"
# bedtools genomecov -d is then run on each, positions with zero depth are
# dropped, the sample name is added as last column and the result is appended
# to the matching *_coverage.bed file.
loc_dir=/archiv/Projects/2019_RMK202_analysis/00_FINAL/06_phage/activePhagemapping/mappingLocation
cleaned="${loc_dir}/allGenomes_mapping_2_genomes_cleaned.txt"

# Genome file for bedtools: columns 1 and 3 of the seq_length.py output, sorted.
seq_length.py ${Assembly} |cut -f 1,3 |sort > "${loc_dir}/genome.bed"
# The coverage files are appended to below, so remove leftovers first.
# -f: the glob matches nothing when the directory was just recreated; do not error.
rm -f "${loc_dir}"/*llGenomes_mapping_2_genomes_cleaned_*_coverage.bed

for names in $(cut -f 1 "${samplesss}")
do
echo ${names}
echo "-----------------"

# Each matching row yields two BED rows: the phage interval and the mate interval.
awk -F "\t" -v samplezzz="$names" 'BEGIN{OFS="\t"} $1~"Streptococcus_phage" && $7==samplezzz {print $1,$2,$3,$7"\n"$4,$5,$6,$7}' "${cleaned}" |bedtools sort > "${loc_dir}/allGenomes_mapping_2_genomes_cleaned_StermPhages.bed"

awk -F "\t" -v samplezzz="$names" 'BEGIN{OFS="\t"} $1~"Streptococcus_phage" && $7==samplezzz {print "Streptococcus_phage_1",$2,$3,$7"\n"$4,$5,$6,$7}' "${cleaned}" |bedtools sort > "${loc_dir}/allGenomes_mapping_2_genomes_cleaned_StermPhages_bothtoegther.bed"

awk -F "\t" -v samplezzz="$names" 'BEGIN{OFS="\t"} $1=="Lactobacillus_phage_1" && $7==samplezzz {print $1,$2,$3,$7"\n"$4,$5,$6,$7}' "${cleaned}" |bedtools sort > "${loc_dir}/allGenomes_mapping_2_genomes_cleaned_LdelPhages.bed"

##----coverage
for phage_set in StermPhages StermPhages_bothtoegther LdelPhages
do
bedtools genomecov -i "${loc_dir}/allGenomes_mapping_2_genomes_cleaned_${phage_set}.bed" -d -g "${loc_dir}/genome.bed" |awk -v samplezzz="$names" -F "\t" 'BEGIN{OFS="\t"} $3!=0 {print $0,samplezzz}' >> "${loc_dir}/allGenomes_mapping_2_genomes_cleaned_${phage_set}_coverage.bed"
done

echo "=================================================================="
done
library(readr)
library(tidyverse)
# Cleaned table of read pairs with one mate on a phage contig; the file is read
# without a header line, so the eight column names are supplied here.
allGenomes_mapping_2_genomes <- read_delim(
  "../data_zenodo/non_genomic_data//allGenomes_mapping_2_genomes_cleaned.txt",
  delim = "\t",
  escape_double = FALSE,
  col_names = c(
    "phageGenome", "phageStart", "phageEnd",
    "SecondGenome", "SecondStart", "SecondEnd",
    "sample", "MAPQ"
  ),
  trim_ws = TRUE
)

# Exploratory views: where do the mates fall on the second genome / on the phage?

# Mate position on the second genome, one panel per second genome and sample.
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_wrap(vars(SecondGenome, sample), scales = "free")

# Position on the phage genome, one panel per phage.
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = phageStart, fill = SecondGenome, color = SecondGenome)
) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  facet_wrap(vars(phageGenome), scales = "free")

# Print the raw mate start positions.
allGenomes_mapping_2_genomes[["SecondStart"]]

# Mate position density, one panel per second genome / phage combination.
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  facet_wrap(vars(SecondGenome, phageGenome), scales = "free")

# Same as a grid (rows: second genome, columns: phage), mate position ...
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_grid(rows = vars(SecondGenome), cols = vars(phageGenome), scales = "free")

# ... and phage position.
ggplot(
  allGenomes_mapping_2_genomes,
  aes(x = phageStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_grid(rows = vars(SecondGenome), cols = vars(phageGenome), scales = "free")

##------------------------
### only streptococci
##------------------------
table(allGenomes_mapping_2_genomes$SecondGenome)

# Keep pairs whose phage mate is on one of the two Streptococcus phages and
# whose other mate is not on CP046131 or CP046132.
# (Optional filters tried earlier: SecondGenome == "CP046134",
#  phageEnd - phageStart > 100, SecondEnd - SecondStart > 100.)
allGenomes_mapping_2_genomes_sterm <- allGenomes_mapping_2_genomes %>%
  filter(
    phageGenome %in% c("Streptococcus_phage_1", "Streptococcus_phage_2"),
    SecondGenome != "CP046131",
    SecondGenome != "CP046132"
  )
table(allGenomes_mapping_2_genomes_sterm$SecondGenome)

# Length of the interval recorded on the phage side.
allGenomes_mapping_2_genomes_sterm$phageDist <-
  allGenomes_mapping_2_genomes_sterm$phageEnd - allGenomes_mapping_2_genomes_sterm$phageStart

ggplot(allGenomes_mapping_2_genomes_sterm, aes(x = phageDist)) +
  geom_histogram() +
  theme_classic()

# Mate position on the second genome, per second genome and sample ...
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_wrap(vars(SecondGenome, sample), scales = "free_y")

# ... and per second genome (rows) and phage (columns).
ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = phageGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_grid(rows = vars(SecondGenome), cols = vars(phageGenome), scales = "free")

## maybe different insertion location per phage and per lineage

ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(
    x = SecondStart,
    fill = interaction(phageGenome, SecondGenome),
    color = interaction(phageGenome, SecondGenome)
  )
) +
  geom_histogram(alpha = 0.5) +
  theme_classic()

ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic()

## maybe some samples are weird

ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5) +
  theme_classic() +
  facet_wrap(vars(sample), scales = "free_y")

# Same table without the three samples RMK202, Konserve_202 and Versand_202.
allGenomes_mapping_3_genomes_sterm <- allGenomes_mapping_2_genomes_sterm %>%
  filter(!sample %in% c("RMK202", "Konserve_202", "Versand_202"))

p2 <- ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, fill = SecondGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5, bins = 100) +
  theme_classic() +
  lims(x = c(0, 2000000)) +
  labs(x = "genomic location") +
  theme(legend.title = element_blank())
p2

# Write the histogram to a PNG file.
png(
  "../03_results//mapping_location_phages_unique.png",
  width = 1900, height = 1200, res = 300
)

p2

dev.off()

# p2 is redefined here (fill by phage instead of by second genome) for the
# stacked view with att_plot, which is created elsewhere in the script.
p2 <- ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, fill = phageGenome, color = SecondGenome)
) +
  geom_histogram(alpha = 0.5, bins = 100) +
  theme_classic() +
  lims(x = c(0, 2000000))
library(patchwork)
p2 / att_plot

att_plot
library(plotly)
ggp <- ggplotly(p2)

# Position on the second genome against position on the phage, as points ...
ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, y = phageStart, fill = phageGenome, color = phageGenome)
) +
  facet_grid(rows = vars(SecondGenome), cols = vars(phageGenome), scales = "free") +
  geom_point(alpha = 0.5) +
  theme_classic()

# ... and as 2D density, per second genome and phage (reduced sample set).
integrationSite <- ggplot(
  allGenomes_mapping_3_genomes_sterm,
  aes(x = SecondStart, y = phageStart)
) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  theme_classic() +
  facet_grid(rows = vars(SecondGenome), cols = vars(phageGenome), scales = "free") +
  labs(
    x = "location of mate mapping on S. thermophilus genome",
    y = "location of mate mapping on phage genome"
  ) +
  theme(legend.position = "none")
integrationSite

# Version used in the final figure: all samples, no facets, legend on the right.
# (optional guide lines: geom_hline(yintercept = c(7500, 29000, 35000)))
integrationSite <- ggplot(
  allGenomes_mapping_2_genomes_sterm,
  aes(x = SecondStart, y = phageStart)
) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  theme_classic() +
  labs(
    x = "location of phage integration\non S. thermophilus genome",
    y = "location of phage integration\non phage genome"
  ) +
  theme(legend.position = "right")
integrationSite
###=====================================
## number and percent of reads supporting the integration
###=====================================

##--------------------------------- new approach with coverage

# Per-position coverage of the phage-associated read pairs (positions with
# zero coverage are absent from the file), one block of rows per sample.
allGenomes_mapping_2_genomes_coverage <- read_delim(
  "../data_zenodo/non_genomic_data//allGenomes_mapping_2_genomes_cleaned_StermPhages_bothtoegther_coverage.bed",
  "\t",
  escape_double = FALSE,
  col_names = c("Genome", "position", "coverage", "sample"),
  trim_ws = TRUE
)

# ggplot(allGenomes_mapping_2_genomes_coverage,aes(x=position,y=coverage,color=sample))+geom_bar(stat="identity",alpha=0.5)+theme_classic()+facet_wrap(~Genome+sample,scales = "free")

# Maximum coverage per sample on CP046134 = phageCoverage.
# Grouping directly by the two columns replaces the former
# interaction(sample, Genome) key that was split again on ".", which would
# mis-assign the parts for any sample name containing a dot.
summary_mapping <- allGenomes_mapping_2_genomes_coverage %>%
  group_by(sample, Genome) %>%
  dplyr::summarise(phageCoverage = max(coverage)) %>%
  ungroup() %>%
  filter(Genome == "CP046134") %>%
  dplyr::select(sample, phageCoverage)

###------------------------------- coverage genome

read_count <- read_delim(
  "../data_zenodo/non_genomic_data//Coverage_bacteria_and_phages_from_MAG.bed",
  "\t",
  escape_double = FALSE,
  col_names = c("chr", "start", "end", "count", "length_mapped", "geneLength", "unknown", "sample"),
  trim_ws = TRUE
)

table(read_count$chr)

# read_count$species <- ifelse(read_count$chr=="L_del_phage_01","L_del_phage_01","S_term_phage_01")

# Per-gene coverage = read count * 600 / gene length.
# NOTE(review): 600 is presumably the sequenced bases per read pair - confirm.
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength

# ggplot(read_count,aes(y=geneCoverage,group=sort,color=sort,fill=sort))+geom_boxplot()+facet_grid(sample~species, scales="free")

library(dplyr)

# Median gene coverage per sample, one column per contig of interest.
all_final <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(BacteriaCoverage = median(geneCoverage)) %>%
  filter(chr %in% c("CP046134", "Streptococcus_phage_1", "Streptococcus_phage_2")) %>%
  spread(., chr, BacteriaCoverage)

# Combine both coverages; the two cheesemaking samples are excluded.
final_phage <- merge(all_final, summary_mapping, by = "sample") %>%
  filter(!sample %in% c("di_K2_6h", "th_K2_8h"))

final_phage$percent_Prophage <- 100 * (final_phage$phageCoverage / final_phage$CP046134)
final_phage$abundance_phage1 <- 100 * (final_phage$Streptococcus_phage_1 / final_phage$CP046134)
final_phage$abundance_phage2 <- 100 * (final_phage$Streptococcus_phage_2 / final_phage$CP046134)

# Human-readable sample labels, in display order.
final_phage$sample <- plyr::revalue(
  final_phage$sample,
  c(
    "lyo202_96" = "Lyo 1996", "Lyo_202_2012" = "Lyo 2012", "Lyo_202_2014" = "Lyo 2014",
    "Konserve_202" = "working stock", "RMK202" = "starter culture 2012",
    "Versand_202" = "starter culture 2018",
    "di_K2_6h" = "cheesemaking day1", "th_K2_8h" = "cheesemaking day2",
    "G1_6_18" = "experiment_A", "G2_6_18" = "experiment_B", "G3_6_18" = "experiment_C",
    "G4_6_18" = "experiment_D", "G5_6_18" = "experiment_E"
  )
)

final_phage$sample <- factor(
  final_phage$sample,
  levels = c(
    "Lyo 1996", "Lyo 2012", "Lyo 2014", "working stock",
    "starter culture 2012", "starter culture 2018",
    "cheesemaking day1", "cheesemaking day2",
    "experiment_A", "experiment_B", "experiment_C", "experiment_D", "experiment_E"
  )
)

# Bars per sample. The second theme() call overrides the first, so the x tick
# labels are hidden. Values outside the 0-15 y limits are dropped by lims().
NUMpLOT <- ggplot(final_phage, aes(x = sample, y = percent_Prophage)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  lims(y = c(0, 15)) +
  labs(x = "", y = "Percent of S. thermophilus\nwith  prophage") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  theme(axis.text.x = element_blank())
NUMpLOT

# Integration-supporting coverage relative to the summed coverage of both phages.
final_phage$percent_Prophage_of_phages <- 100 *
  (final_phage$phageCoverage / (final_phage$Streptococcus_phage_2 + final_phage$Streptococcus_phage_1))

NUMpLOT_phage <- ggplot(final_phage, aes(x = sample, y = percent_Prophage_of_phages)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  lims(y = c(0, 15)) +
  labs(x = "", y = "Percent of S. phage\nwhich is inserted") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9)) +
  theme(axis.text.x = element_blank())
NUMpLOT_phage

# FIX: a summary that filtered on `chr` and averaged `median_01` stood here.
# final_phage has neither column (its columns are sample, the three coverage
# columns, phageCoverage and the percentages computed above), so the statement
# failed. Mean and sd are now reported for the percentages that do exist.
final_phage %>%
  dplyr::summarize(
    mean_percent_Prophage = mean(percent_Prophage),
    sd_percent_Prophage = sd(percent_Prophage),
    mean_percent_Prophage_of_phages = mean(percent_Prophage_of_phages),
    sd_percent_Prophage_of_phages = sd(percent_Prophage_of_phages)
  )

min(final_phage$percent_Prophage)
max(final_phage$percent_Prophage)

min(final_phage$percent_Prophage_of_phages)
max(final_phage$percent_Prophage_of_phages)
##----------

# Long format: one row per sample and phage with its abundance in percent.
final_phage_long_phage <- final_phage %>%
  gather(., species, percent, c("abundance_phage1", "abundance_phage2"), factor_key = TRUE, na.rm = TRUE)
# Biolog_long <- gather(Biolog_all_plates, well, intensity, A01:H12, factor_key=TRUE,na.rm = TRUE) 
# final_phage$abundance_phage_both <- final_phage$abundance_phage2+final_phage$abundance_phage1

ggplot(final_phage_long_phage, aes(x = sample, y = percent, fill = species)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  labs(x = "", y = "Percent of S. thermophilus MAG\nwith putative prophage") +
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 9))

##----------
## final plot
##----------
library(patchwork)
integrationSite + NUMpLOT + plot_layout(nrow = 2, heights = c(2, 1))

svg("../03_results/IntegrationProphage.svg", width = 6.5, height = 11.5)
integrationSite + NUMpLOT + NUMpLOT_phage + plot_layout(nrow = 3, heights = c(2, 1, 1))
dev.off()

# svg("~/Desktop/Projects/2019_RMK202_analysis/plot/IntegrationProphage.svg",width=6.5,height=5.5)
# NUMpLOT_phage+NUMpLOT +   plot_layout(nrow=2) #
#    dev.off()   

1.6.4 Spacer origin

###-----------------------
## grouping
###-----------------------
# Strain -> lineage -> colour table. The strains are listed lineage by lineage
# (9, 3, 6 and 4 strains), so lineage labels and colours are repeated to match.
groupingRMK202Strains <- data.frame(
  strain = c(
    "13492", "13491", "24854", "24837", "13500", "13493", "13498", "SMAG", "24853",
    "13494", "24855", "S72",
    "13499c1", "13499", "13499c2", "24838", "24840", "S50",
    "13497", "24839", "13495", "13496"
  ),
  group = rep(
    c("lineage 1", "lineage 2", "lineage 3", "lineage 4"),
    times = c(9, 3, 6, 4)
  ),
  colors = rep(
    c("#0000FF", "#6699FF", "#99CCFF", "#00FFFF"),
    times = c(9, 3, 6, 4)
  )
)

# Tab-separated, without header, row names or quoting.
write.table(
  groupingRMK202Strains,
  "../03_results//ReferenceGenomeGrouping.txt",
  na = "",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

# error with wrong number as genome labels e.g. 24839 instead of the right 24739
#ReferenceGenomeGrouping <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/00_FINAL/log/ReferenceGenomeGrouping.txt<",  "\t", escape_double = FALSE, col_names = c("strain","lineage","colors"),  trim_ws = TRUE)

# The grouping used downstream is read from the deposited file.
ReferenceGenomeGrouping <- read_delim(
  "../data_zenodo/non_genomic_data//ReferenceGenomeGrouping2.txt",
  delim = "\t",
  escape_double = FALSE,
  col_names = c("strain", "lineage", "colors"),
  trim_ws = TRUE
)

##=========================
## new

##------------- spacer count DADA over metagenomic samples

dada_spacer_count <- read_delim(
  "../data_zenodo/non_genomic_data/dada_spacer_count_sterm.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

##=========================
## old
## the old analysis is pretty cool because I can distinguish between protospacer and spacer mapping

###-------------------------
## genome coverage
###-------------------------

read_count <- read_delim(
  "../data_zenodo/non_genomic_data/Coverage_bacteria_and_phages_from_MAG.bed",
  delim = "\t",
  escape_double = FALSE,
  col_names = c("chr", "start", "end", "count", "length_mapped", "geneLength", "unknown", "sample"),
  trim_ws = TRUE
)

# Per-gene coverage = read count * 600 / gene length.
read_count$geneCoverage <- (read_count$count * 600) / read_count$geneLength

# Median gene coverage of CP046134 per sample; used as normaliser below.
# (Other contigs in the file include CP046131, Lactobacillus_phage_1,
#  Streptococcus_phage_1 and Streptococcus_phage_2.)
all_final_strepto <- read_count %>%
  group_by(sample, chr) %>%
  dplyr::summarize(median = median(geneCoverage)) %>%
  filter(chr == "CP046134") %>%
  select(-chr)

##-----------------------------------------------
##------------------------- bwa spacer count normalised
##-----------------------------------------------
#CRISPR_spacer_coverage <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/CRISPR_spacer_coverage.bed",   "\t", escape_double = FALSE, col_names = c("Name","startSpacer","endSpacer","ClusterName","numReads","basesCoverd","basesTotal","numcov","sample"),  trim_ws = TRUE)
#reads4normalization <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/reads4normalization.txt", "\t", escape_double = FALSE, col_names = c("sample","readNumber","species"), trim_ws = TRUE) ;  reads4normalization <- reads4normalization[,-3]

CRISPR_spacer_coverage <- read_delim(
  "../data_zenodo/non_genomic_data/CRISPR_spacer_coverage_sterm.bed",
  delim = "\t",
  escape_double = FALSE,
  col_names = c(
    "Name", "startSpacer", "endSpacer", "ClusterName", "numReads",
    "basesCoverd", "basesTotal", "numcov", "sample"
  ),
  trim_ws = TRUE
)

# Read totals per sample; the third column (species) is dropped.
reads4normalization <- read_delim(
  "../data_zenodo/non_genomic_data/reads4normalization.txt",
  delim = "\t",
  escape_double = FALSE,
  col_names = c("sample", "readNumber", "species"),
  trim_ws = TRUE
)
reads4normalization <- reads4normalization[, -3]

###---------------- normalize with reads mapped to the CRISPR spacers
# Alternative normalisation by total read number (not used):
# CRISPR_spacer_coverage <- merge(CRISPR_spacer_coverage,reads4normalization,by="sample",all.x = TRUE)
# CRISPR_spacer_coverage$CPM <- 1000000*(CRISPR_spacer_coverage$numReads/CRISPR_spacer_coverage$readNumber)

# Attach the per-sample median genome coverage, drop sample G4_6_18 and express
# each spacer's read count relative to that median (stored in column CPM).
CRISPR_spacer_coverage <- merge(
  CRISPR_spacer_coverage, all_final_strepto,
  by = "sample", all.x = TRUE
) %>%
  filter(sample != "G4_6_18")
CRISPR_spacer_coverage$CPM <- CRISPR_spacer_coverage$numReads / CRISPR_spacer_coverage$median

# table(CRISPR_spacer_coverage$sample)
# CRISPR_spacer_coverage %>% filter(!is.na(CPM)) %>% select(sample) %>% table()

# Rows without a normalised value (no median for that sample) are removed.
CRISPR_spacer_coverage <- CRISPR_spacer_coverage %>%
  filter(!is.na(CPM))

# CRISPR_spacer_coverage <- CRISPR_spacer_coverage[!duplicated(CRISPR_spacer_coverage[,c("sample","ClusterName")]), ]

# Wide table: one row per spacer (Name), one column per sample.
bwaSPACERCount_wide <- CRISPR_spacer_coverage %>%
  dplyr::select(sample, Name, CPM) %>%
  spread(key = sample, value = CPM)
##-----------------------------------------------
##------------------------- bwa protospacer count normalised
##-----------------------------------------------
# Same normalisation as for the spacer counts, applied to the protospacer
# coverage table.
# FIX: the path started with "./data_zenodo/" while every other input of this
# script is read from "../data_zenodo/"; it now uses the same location.
CRISPR_protospacer_coverage <- read_delim(
  "../data_zenodo/non_genomic_data/CRISPR_spacer_coverage_protospacer_sterm.bed",
  "\t",
  escape_double = FALSE,
  col_names = c(
    "Name", "startSpacer", "endSpacer", "ClusterName", "numReads",
    "basesCoverd", "basesTotal", "numcov", "sample"
  ),
  trim_ws = TRUE
)
# reads4normalization <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/reads4normalization.txt", "\t", escape_double = FALSE, col_names = c("sample","readNumber","species"), trim_ws = TRUE) ;  reads4normalization <- reads4normalization[,-3]
###---------------- normalize with reads mapped to the CRISPR spacers
# Alternative normalisation by total read number (not used):
# CRISPR_protospacer_coverage <- merge(CRISPR_protospacer_coverage,reads4normalization,by="sample",all.x = TRUE)
# CRISPR_protospacer_coverage$CPM <- 1000000*(CRISPR_protospacer_coverage$numReads/CRISPR_protospacer_coverage$readNumber)

# Attach the per-sample median coverage of CP046134 (all_final_strepto), drop
# sample G4_6_18 and divide the read counts by that median (column CPM).
CRISPR_protospacer_coverage <- merge(
  CRISPR_protospacer_coverage, all_final_strepto,
  by = "sample", all.x = TRUE
) %>%
  filter(sample != "G4_6_18")
CRISPR_protospacer_coverage$CPM <-
  CRISPR_protospacer_coverage$numReads / CRISPR_protospacer_coverage$median

# Rows per sample, and distribution of the number of rows per cluster.
table(CRISPR_protospacer_coverage$sample)
table(table(CRISPR_protospacer_coverage$ClusterName))

# Rows without a normalised value (no median for that sample) are removed.
CRISPR_protospacer_coverage <- CRISPR_protospacer_coverage %>%
  filter(!is.na(CPM))

# CRISPR_protospacer_coverage <- CRISPR_protospacer_coverage[!duplicated(CRISPR_protospacer_coverage[,c("sample","ClusterName")]), ]
# table(CRISPR_spacer_coverage_02$sample)

# Wide table: one row per protospacer (Name), one column per sample.
bwaPROTOSPACERCount_wide <- CRISPR_protospacer_coverage %>%
  dplyr::select(sample, Name, CPM) %>%
  spread(sample, CPM)
# bwaPROTOSPACERCount_wide <- CRISPR_spacer_coverage %>% dplyr::select(sample,Name,CPM) %>% unique()%>% spread(.,Name,CPM) #%>% rename(spacer=Name) #rename(Name="spacer")
 # CRISPR_spacer_coverage %>% dplyr::select(sample,Name) %>% nrow()
 # CRISPR_spacer_coverage %>% dplyr::select(sample,Name) %>% unique() %>% nrow()

# CRISPR_spacer_coverage %>% dplyr::select(sample,ClusterName)
###===========================================
## spacers only in experiment
###===========================================
ncol(bwaSPACERCount_wide)

# Rows whose values sum to zero over all columns except the first (identifier)
# column and the experiment columns (names containing "_6_18").
bwa_experiment_cols <- grep("_6_18", colnames(bwaSPACERCount_wide))
bwa_other_total <- rowSums(bwaSPACERCount_wide[, -c(1, bwa_experiment_cols)])
bwaSPACERCount_wide[which(bwa_other_total == 0), ]

dada_spacer_count

## the following spacers only have coverage in the experiment samples
dada_experiment_cols <- grep("_6_18", colnames(dada_spacer_count))
dada_other_total <- rowSums(dada_spacer_count[, -c(1, dada_experiment_cols)])
onlyExperimentSpacers <- dada_spacer_count[which(dada_other_total == 0), ]

onlyExperimentSpacers



# bwaPROTOSPACERCount_wide[which(rowSums(bwaPROTOSPACERCount_wide[,-c(1,grep("_6_18",colnames(bwaPROTOSPACERCount_wide)))])==0),]
##!!!!!!!!!!!!!!!sample file
# Sample annotation; metasample_colors is created elsewhere in the script.
sampleDF <- metasample_colors

##!!!!!!!!!!!!!!!spacer file
##-------information array assignment

metaspacer_info <- read_delim(
  "../data_zenodo/non_genomic_data/uniqueSpacers_count_Ref_wOLDstrains_both.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Remove ">" characters from the cluster names.
metaspacer_info$METAClusterName <- gsub(">", "", metaspacer_info$METAClusterName)

spacers_info <- read_delim(
  "../data_zenodo/non_genomic_data/spacers_info.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

##-------information spacer blast
# spacers_info_blast <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/all_differentBlastHits.txt", "\t", escape_double = FALSE, trim_ws = TRUE,col_names = c("DB","spacer","assigned")) %>% spread(., DB, assigned)
# One row per spacer, one column per database (DB) holding the assigned hit.
spacers_info_blast <- read_delim(
  "../data_zenodo/non_genomic_data//all_differentBlastHits.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE,
  col_names = c("DB", "spacer", "assigned")
) %>%
  spread(., DB, assigned)

##-------clustering vContact blast
# Streptococcus_phage_Cluster_assignment <- read_csv("~/Desktop/Projects/2019_PhageDB/Vcontact_with_meta_new//allPhages_vContact/Streptococcus_phage_Cluster_assignment.txt",col_types = cols(.default = "c")) %>% mutate(Size = as.double(Size), Quality = as.double(Quality)) %>% select(-c(Order,Family,Genus,Quality,Type.x))
# All columns are read as character; Size and Quality are converted to numeric
# before the taxonomy, Quality and Type.x columns are dropped.
Streptococcus_phage_Cluster_assignment <- read_csv(
  "../data_zenodo/non_genomic_data/Streptococcus_phage_Cluster_assignment.txt",
  col_types = cols(.default = "c")
) %>%
  dplyr::mutate(Size = as.double(Size), Quality = as.double(Quality)) %>%
  dplyr::select(-c(Order, Family, Genus, Quality, Type.x))

# Streptococcus_phage_Cluster_assignment[grep("rmk",Streptococcus_phage_Cluster_assignment$Genome),"Genome"]

# Streptococcus_phage_Cluster_assignment %>% filter(!is.na(Type.y)) %>% select(VC) %>% table()
# sort(table(Streptococcus_phage_Cluster_assignment$VC),decreasing=T) %>% head(n=30)
# sort(table(Streptococcus_phage_Cluster_assignment$Type.y),decreasing=T)
##!!!!!!!!!!!!!!!spacer matrix FROM DADA
  # CRISPR_spacer_coverage_extended_wide <-  CRISPR_spacer_coverage %>% select(sample,Name,CPM) %>% spread(sample, CPM)

library(readr)
# Re-read the DADA spacer counts and replace the spacer column by the matching
# METAClusterName. select() is namespaced as dplyr::select, like in the phage
# cluster step above, so another attached package cannot mask it.
dada_spacer_count <- read_delim(
  "../data_zenodo/non_genomic_data//dada_spacer_count_sterm.txt",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

metaspacer_info_tmp <- metaspacer_info %>%
  dplyr::select(METAClusterName, spacer)

dada_spacer_count <- merge(
  dada_spacer_count, metaspacer_info_tmp,
  by.x = "spacer", by.y = "spacer", all.x = TRUE
) %>%
  dplyr::select(-spacer) #%>% rename( spacer = METAClusterName) 
##-------information strains explained
# library(readr)
# uniqueSpacers_count_Ref <- read_delim("~/Desktop/Projects/2019_RMK202_analysis/04_CRISPR_spacer/ForDADA2/uniqueSpacers_count_Ref.txt","\t", escape_double = FALSE, trim_ws = TRUE)
# uniqueSpacers_count_Ref$Num_ref_explained <- (uniqueSpacers_count_Ref$REF_mst1>0)+(uniqueSpacers_count_Ref$REF_mst2>0)+(uniqueSpacers_count_Ref$REF_RMK202>0)
# uniqueSpacers_count_Ref$RefStrain_explained <- ifelse(uniqueSpacers_count_Ref$Num_ref_explained>1,"multiple Strains",ifelse(uniqueSpacers_count_Ref$REF_RMK202>0,"Meta_RMK202",ifelse(uniqueSpacers_count_Ref$REF_mst1>0,"mst 1",ifelse(uniqueSpacers_count_Ref$REF_mst2>0,"mst2","not explained")))) 
# table(uniqueSpacers_count_Ref$RefStrain_explained)
# uniqueSpacers_count_Ref_final <- uniqueSpacers_count_Ref %>% mutate(spacer = str_replace( spacerName, ">","")) %>% select(spacer,RefStrain_explained) 

#----new----

# unique(spacer_Infos_Sterm_final$sample)
# One row per spacer occurrence: cluster, cluster size, sample (strain/genome),
# array and spacer identifiers. SPACER is parsed as a number.
spacer_Infos_Sterm_final <- read_delim(
  "../data_zenodo/non_genomic_data/spacer_Infos_Sterm_final.txt",
  "\t",
  escape_double = FALSE,
  col_names = c("ClusterName", "numSpacers_in_CLUSTER", "sample", "array", "spc", "ARRAY", "SPACER"),
  col_types = cols(SPACER = col_number()),
  trim_ws = TRUE
)
# spacer_Infos_Sterm_final <- unique(spacer_Infos_Sterm_final)

# Per-cluster attributes. select() is namespaced (dplyr::select), consistent
# with the other selections in this section, to be safe against masking.
# spacer_Infos_Sterm_final_tmp <- spacer_Infos_Sterm_final %>% select(ClusterName,numSpacers_in_CLUSTER)
spacer_Infos_Sterm_final_tmp <- spacer_Infos_Sterm_final %>%
  dplyr::select(ClusterName, numSpacers_in_CLUSTER, SPACER)

# Keep the first row per (ClusterName, numSpacers_in_CLUSTER); SPACER (third
# column) is ignored when looking for duplicates.
spacer_Infos_Sterm_final_tmp <-
  spacer_Infos_Sterm_final_tmp[!duplicated(spacer_Infos_Sterm_final_tmp[-3]), ]

# spacer_Infos_Sterm_final %>% filter(ClusterName=="Cluster_1")
# spacer_Infos_Sterm_final_tmp<- spacer_Infos_Sterm_final_tmp %>% filter(ClusterName=="Cluster_1")

# Cluster x sample count matrix (how often each cluster occurs in each sample),
# then the per-cluster attributes are attached.
spacer_Infos_Sterm_final_new <- spacer_Infos_Sterm_final %>%
  dplyr::select(c(ClusterName, sample)) %>%
  table() %>%
  as.data.frame() %>%
  spread(sample, Freq)
spacer_Infos_Sterm_final_new <- merge(
  spacer_Infos_Sterm_final_new, spacer_Infos_Sterm_final_tmp,
  by = "ClusterName"
) %>%
  unique()
# table(spacer_Infos_Sterm_final_new$numSpacers_in_CLUSTER)
# spacer_Infos_Sterm_final_new2 <- merge(spacer_Infos_Sterm_final_new,spacer_Infos_Sterm_final_tmp,by="ClusterName") %>% unique()
# sum(is.na(spacer_Infos_Sterm_final_new$SPACER))
# uniqueSpacers_count_Ref$RefStrain_explained <- ifelse(spacer_Infos_Sterm_final_new$numSpacers_in_CLUSTER>1,"multiple Strains",ifelse(uniqueSpacers_count_Ref$REF_RMK202>0,"Meta_RMK202",ifelse(uniqueSpacers_count_Ref$REF_mst1>0,"mst 1",ifelse(uniqueSpacers_count_Ref$REF_mst2>0,"mst2","not explained"))))

# range(table(spacer_Infos_Sterm_final$sample))

uniqueSpacers_count_Ref_final <- spacer_Infos_Sterm_final_new
nrow(spacer_Infos_Sterm_final_new)
##------- merge BLAST hits onto the metagenome spacer clusters
# (An earlier variant of this merge, keyed on "spacer" in spacers_info, was
# superseded by the version below.)
# sort(table(spacers_info_blast$phageDB), decreasing = TRUE)
# sort(table(spacers_info_blast$localPHAGE), decreasing = TRUE)

# Full outer join: METAClusterName (metaspacer_info) is matched against
# spacer (spacers_info_blast); unmatched rows from either side are kept.
spacers_final <- merge(
  metaspacer_info,
  spacers_info_blast,
  by.x = "METAClusterName",
  by.y = "spacer",
  all = TRUE
)

# Label every row with the first hit column that is not NA, checked in the
# order localPHAGE > localBAC > phageDB > BactDB. Rows without any hit are
# labelled "No-match".
spacers_final$explainedBLAST <- with(
  spacers_final,
  ifelse(
    !is.na(localPHAGE), "localPHAGE",
    ifelse(
      !is.na(localBAC), "localBAC",
      ifelse(
        !is.na(phageDB), "phageDB",
        ifelse(!is.na(BactDB), "BactDB", "No-match")
      )
    )
  )
)

# Show the spacers without any BLAST explanation.
spacers_final[which(spacers_final$explainedBLAST == "No-match"), ]

# Number of spacers per explanation category.
table(spacers_final$explainedBLAST)

# Show the spacers that only hit the bacterial database.
spacers_final[which(spacers_final$explainedBLAST == "BactDB"), ]

# Count of rows without a label.
sum(is.na(spacers_final$explainedBLAST))


 # table(spacers_final$VC)
  
  # dim(uniqueSpacers_count_Ref_final)
  # uniqueSpacers_count_Ref_final$numSpacers_in_CLUSTER==0
# # nrow(spacers_final)
# nrow(uniqueSpacers_count_Ref_final)
# sum(is.na(spacers_final$phageDB))
# spacers_final$phageDB

  # Rename four phage entries before joining on the phage name; presumably
  # these are older names that would otherwise not match the Genome column
  # of Streptococcus_phage_Cluster_assignment (verify against that table).
  phage_name_synonyms <- c(
    "Lactococcus lactis phage BK5-T" = "Lactococcus phage BK5-T",
    "Streptococcus thermophilus bacteriophage 7201" = "Streptococcus virus 7201",
    "Streptococcus thermophilus bacteriophage Sfi19" = "Streptococcus virus Sfi19",
    "Streptococcus thermophilus temperate bacteriophage O1205" =
      "Streptococcus virus O1205"
  )
  spacers_final$phageDB <- revalue(spacers_final$phageDB, phage_name_synonyms)

  # Full outer join with the per-cluster spacer information.
  spacers_final <- merge(
    spacers_final,
    uniqueSpacers_count_Ref_final,
    by.x = "ClusterINFO",
    by.y = "ClusterName",
    all = TRUE
  )

  # Left join with the phage cluster assignment: every spacer row is kept,
  # cluster columns are NA where the phage name has no match.
  spacers_final <- merge(
    spacers_final,
    Streptococcus_phage_Cluster_assignment,
    by.x = "phageDB",
    by.y = "Genome",
    all.x = TRUE
  )

  # Rows that have a phage database hit but received no VC from the join.
  spacers_final[which(is.na(spacers_final$VC) & !is.na(spacers_final$phageDB)), ]
 #   
  # tmp <- spacers_final[which(is.na(spacers_final$VC)&!is.na(spacers_final$phageDB)),] %>% select(phageDB)
 #

  #  for (i in 1:nrow(tmp)) { print(grep(paste0(" ",tmp[i,"phageShort"],"$"),Streptococcus_phage_Cluster_assignment$Genome))}
  # 
  # i=9
  # tmp[i,"phageDB"]
  # Streptococcus_phage_Cluster_assignment[grep(paste0(" ",tmp[i,"phageShort"],"$"),Streptococcus_phage_Cluster_assignment$Genome),"Genome"]
  # 

  
  # grep(tmp$phageShort,spacers_final$phageDB)
  
  # tmp
   
      # Streptococcus_phage_Cluster_assignment[grep("7201$",Streptococcus_phage_Cluster_assignment$Genome),]

  # spacers_final <- merge(spacers_final,spacer_Infos_Sterm_final_new,by.x="ClusterINFO",by.y="ClusterName",all=TRUE)
  
##------------------ classify spacers by cluster size
# numSpacers_in_CLUSTER is NA -> "only Metagenome"
#                       > 1   -> "multiple Strains"
#                       == 1  -> "unique spacer"
#                       else  -> "not explained"
n_in_cluster <- spacers_final$numSpacers_in_CLUSTER
spacers_final$RefStrain_explained <- ifelse(
  is.na(n_in_cluster), "only Metagenome",
  ifelse(
    n_in_cluster > 1, "multiple Strains",
    ifelse(n_in_cluster == 1, "unique spacer", "not explained")
  )
)
# table(spacers_final$RefStrain_explained)
# hist(spacers_final$numSpacers_in_CLUSTER)

##-------------- quick analysis
# Frequency of each VC / Type.y value among the spacers, ascending.
sort(table(spacers_final$VC))
sort(table(spacers_final$Type.y))
# sort(table(spacers_final$Type.x))

# Look up the members of selected VCs in the cluster assignment table.
filter(Streptococcus_phage_Cluster_assignment, VC == "151_0")
filter(Streptococcus_phage_Cluster_assignment, VC == "92_0")
filter(Streptococcus_phage_Cluster_assignment, VC == "385_0")
filter(Streptococcus_phage_Cluster_assignment, VC == "316_1")
filter(Streptococcus_phage_Cluster_assignment, VC == "251_1")

# Streptococcus_phage_Cluster_assignment[grep("Javan63", Streptococcus_phage_Cluster_assignment$Genome), ]

# Spacers with a phage database hit that still lack a VC assignment.
filter(spacers_final, !is.na(phageDB), is.na(VC))

# Export the final table as tab-separated text: NA written as empty string,
# no quoting, no row names and no header line.
write.table(
  spacers_final,
  "../03_results//Clusters_spacers_final.txt",
  na = "",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)


###-----------------------------------
## num spacer in cluster
###-----------------------------------

# Quick overview of the final spacer table.
nrow(spacers_final)
table(spacers_final$numSpacers_in_CLUSTER)
sum(is.na(spacers_final$numSpacers_in_CLUSTER))
table(spacers_final$SPACER)
spacers_final$ARRAYINFO

# Exploratory view: absolute number of spacers per SPACER value, one panel
# per ARRAYINFO value, coloured by BLAST explanation.
ggplot(
  spacers_final,
  aes(x = SPACER, fill = explainedBLAST, color = explainedBLAST)
) +
  geom_bar() +
  facet_wrap(~ARRAYINFO) +
  theme_classic()

# Plot data: only the three columns needed, rows without SPACER dropped.
DF <- spacers_final %>%
  select(explainedBLAST, SPACER, ARRAYINFO) %>%
  filter(!is.na(SPACER))

# Fix the stacking order of the categories.
DF$explainedBLAST <- factor(
  DF$explainedBLAST,
  levels = c("No-match", "BactDB", "localBAC", "phageDB", "localPHAGE")
)

# Palette named by category. With an unnamed vector ggplot2 assigns colours
# by position among the levels that occur in the data, so a category without
# rows would silently shift every following colour; the legend is hidden
# below, which would make such a shift impossible to spot. The colour of each
# category is the same as with the previous positional palette.
colorsssss <- c(
  "No-match" = "lightgray",
  "BactDB" = "yellow",
  "localBAC" = "goldenrod3",
  "phageDB" = "darkturquoise",
  "localPHAGE" = "darkcyan"
)

# Share of each explanation category per SPACER value (bars filled to 100 %),
# one panel per ARRAYINFO value with free axes.
spacerlocation_explained <- ggplot(DF, aes(x = SPACER, fill = explainedBLAST)) +
  geom_bar(position = "fill") +
  labs(x = "Spacer position", y = "percent of spacers") +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~ARRAYINFO, scales = "free") +
  theme_classic() +
  scale_fill_manual(values = colorsssss) +
  theme(legend.position = "none")
spacerlocation_explained

# Write the figure to disk.
svg("../03_results//spacerlocation_explained.svg", width = 5, height = 3)
spacerlocation_explained
dev.off()

1.6.5 protospacer/spacer

library(ggplot2)

##---------------------------
### correlation protospacer and spacer
##---------------------------
# (A disabled earlier variant compared bwa counts against dada counts via
# dada_spacer_count; it is not part of the current analysis.)

# Wide -> long: one row per Name x sample. The `method` tag added here is
# removed again after the merge below.
bwaSPACERCount_ggprep <- bwaSPACERCount_wide %>%
  gather(key = "sample", value = "bwaCount", -Name) %>%
  add_column(method = "bwa")

bwaPROTOSPACERCount_ggprep <- bwaPROTOSPACERCount_wide %>%
  gather(key = "sample", value = "protospacer", -Name) %>%
  add_column(method = "bwaProto")

# Pair the protospacer and spacer value of every Name x sample combination
# that occurs in both tables.
countComparison <- merge(
  bwaPROTOSPACERCount_ggprep,
  bwaSPACERCount_ggprep,
  by = c("Name", "sample")
) %>%
  select(-method.x, -method.y)

# Attach the spacer annotation (full outer join on the cluster name).
countComparison_ggprep <- merge(
  countComparison,
  spacers_final,
  by.x = "Name",
  by.y = "METAClusterName",
  all = TRUE
)

# Add an offset of 0.1 to both measures -- presumably so that zeros remain
# representable on the log-scaled axes used below (confirm).
countComparison_ggprep$protospacer <- countComparison_ggprep$protospacer + 0.1
countComparison_ggprep$bwaCount <- countComparison_ggprep$bwaCount + 0.1

# Exploratory scatter plots (displayed only, not saved).
# 1) log2-transformed coordinates, coloured by RefStrain_explained
ggplot(
  countComparison_ggprep,
  aes(x = protospacer, y = bwaCount, color = RefStrain_explained)
) +
  geom_point() +
  theme_classic() +
  coord_trans(y = "log2", x = "log2")

# 2) log2-transformed coordinates, coloured by explainedBLAST, with linear fit
ggplot(
  countComparison_ggprep,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point() +
  theme_classic() +
  coord_trans(y = "log2", x = "log2") +
  geom_smooth(method = "lm", fill = NA, se = TRUE)

# 3) linear coordinates, coloured by explainedBLAST, with linear fit
ggplot(
  countComparison_ggprep,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point() +
  theme_classic() +
  geom_smooth(method = "lm", fill = NA, se = TRUE)

##---------------------------
# REMOVE low samples
##---------------------------
table(countComparison_ggprep$sample)

# Keep rows with both measures present whose spacer is explained by a phage
# hit (localPHAGE or phageDB), and exclude the samples di_K2_6h and th_K2_8h.
countComparison_ggprep_02 <- countComparison_ggprep %>%
  filter(!is.na(protospacer)) %>%
  filter(!is.na(bwaCount)) %>%
  filter(explainedBLAST %in% c("localPHAGE", "phageDB")) %>%
  filter(!sample %in% c("di_K2_6h", "th_K2_8h"))
table(countComparison_ggprep_02$explainedBLAST)

# Fixed level order so that the colours below are assigned in this order.
countComparison_ggprep_02$explainedBLAST <- factor(
  countComparison_ggprep_02$explainedBLAST,
  levels = c("localPHAGE", "phageDB")
)

colorsssss <- c("red", "darkturquoise")

# Formula used by stat_poly_eq() for the equation / R^2 annotation.
my.formula <- y ~ x
library(ggpmisc)

# Spacer vs. protospacer on log10 axes with one linear fit per category.
# The points are drawn once, semi-transparent: the previous version added a
# second, opaque geom_point() layer on top, which cancelled alpha = 0.5.
# The stray empty trailing argument of stat_poly_eq() was removed as well.
plot <- ggplot(
  countComparison_ggprep_02,
  aes(x = protospacer, y = bwaCount, color = explainedBLAST)
) +
  geom_point(alpha = 0.5) +
  theme_classic() +
  scale_fill_manual(values = colorsssss) +
  scale_color_manual(values = colorsssss) +
  labs(x = "protospacer [copy number]", y = "spacers  [copy number]") +
  scale_y_log10(breaks = c(0.1, 0.15, 0.3)) +
  scale_x_log10(breaks = c(0.1, 1, 10, 30)) +
  geom_smooth(method = "lm", se = FALSE) +
  stat_poly_eq(
    formula = my.formula,
    aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
    parse = TRUE
  )
plot

# Write the figure to disk.
png(
  "../03_results//spacer_protospacer_regression.png",
  width = 2800, height = 2500, res = 300
)
plot
dev.off()

 ##--------------------
 ## marginal density plots ("histogramm")
 ##--------------------

 # Density of the protospacer values on a log10 x axis, one panel per
 # explainedBLAST level; axis text, ticks and legend are removed because the
 # panel is meant to sit next to the scatter plot.
 # (A disabled variant restricted this plot to the localPHAGE rows.)
 plot_density_x <- ggplot(
   countComparison_ggprep_02,
   aes(x = protospacer, color = explainedBLAST, fill = explainedBLAST)
 ) +
   geom_density(alpha = 0.5) +
   theme_classic() +
   facet_wrap(~explainedBLAST, scales = "free_y", nrow = 2) +
   scale_fill_manual(values = colorsssss) +
   scale_color_manual(values = colorsssss) +
   labs(x = "", y = "density") +
   scale_x_log10(breaks = c(1, 10, 30)) +
   theme(
     legend.position = "none",
     axis.ticks = element_blank(),
     axis.text = element_blank(),
     axis.title.x = element_blank()
   )
 plot_density_x

 # Density of the spacer values (bwaCount) on a log10 axis; coord_flip()
 # turns it sideways so it can be placed to the right of the scatter plot.
 plot_density_y <- ggplot(
   countComparison_ggprep_02,
   aes(x = bwaCount, color = explainedBLAST, fill = explainedBLAST)
 ) +
   geom_density(alpha = 0.5) +
   theme_classic() +
   scale_fill_manual(values = colorsssss) +
   scale_color_manual(values = colorsssss) +
   labs(x = "", y = "density") +
   scale_x_log10(breaks = c(0.15, 0.3)) +
   theme(
     legend.position = "none",
     axis.ticks = element_blank(),
     axis.text = element_blank(),
     axis.title.y = element_blank()
   ) +
   coord_flip()
 plot_density_y





###---------------------------
## make mix plot
###---------------------------

library(ggpubr)
library(patchwork)

# Scatter plot without its legend; used in every layout below.
scatter_no_legend <- plot + theme(legend.position = "none")

# Full layout: protospacer density (top left, empty spacer top right) above
# the scatter plot with the flipped spacer density on its right.
spacer_vs_proto_combined <-
  (plot_density_x + plot_spacer() + plot_layout(widths = c(3, 1))) /
    ((scatter_no_legend + plot_density_y) + plot_layout(widths = c(3, 1))) +
    plot_layout(heights = c(2, 3))
spacer_vs_proto_combined

# Layout 1: scatter plot + spacer density on the right.
svg("../03_results//spacer_vs_proto_01.svg", width = 8, height = 6)
scatter_no_legend + plot_density_y + plot_layout(widths = c(3, 1))
dev.off()

# Layout 2: protospacer density on top of the scatter plot.
svg("../03_results//spacer_vs_proto_02.svg", width = 6, height = 8)
plot_density_x / scatter_no_legend + plot_layout(heights = c(1, 3))
dev.off()

# Layout 3 (full layout) as PNG and as SVG. Each device is opened, drawn to
# and closed on its own: previously png() was followed directly by svg()
# with a single dev.off(), so the PNG was never drawn or closed and the
# leaked device stayed the current device for the plots that follow.
png("../03_results//spacer_vs_proto_03.png", width = 2000, height = 2000, res = 300)
spacer_vs_proto_combined
dev.off()

svg("../03_results//spacer_vs_proto_03.svg", width = 5.5, height = 7)
spacer_vs_proto_combined
dev.off()

##--------------------
## ridgeplot
##--------------------
# Data for the ridge plot: all rows of countComparison_ggprep_02.
# The script used to assign three other subsets first and overwrite each of
# them immediately; they are kept here as disabled alternatives:
# countComparison_ggprep_03 <- countComparison_ggprep_02 %>% filter(explainedBLAST != "localPHAGE") %>% filter(sample != "Versand_202") %>% filter(sample != "RMK202")
# countComparison_ggprep_03 <- countComparison_ggprep_02 %>% filter(explainedBLAST == "localPHAGE") %>% filter(sample != "Versand_202") %>% filter(sample != "RMK202")
# countComparison_ggprep_03 <- countComparison_ggprep_02 %>% filter(sample != "Versand_202") %>% filter(sample != "RMK202")
countComparison_ggprep_03 <- countComparison_ggprep_02
# table(countComparison_ggprep_02$explainedBLAST)

library(ggridges)

# Distribution of the protospacer values per sample (one ridge per sample),
# on a log10 x axis, one panel per localPHAGE value; the fill follows x.
ggridgplot <- ggplot(
  countComparison_ggprep_03,
  aes(x = protospacer, y = sample, fill = stat(x))
) +
  facet_wrap(~localPHAGE, nrow = 3, scales = "free_y") +
  scale_x_log10(breaks = c(0.1, 1, 10, 30)) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "protospacers", option = "C") +
  labs(x = 'protospacers [copy number]', y = "") +
  theme(legend.position = "none")

ggridgplot

# The ridge plot gets its own file name. It used to be written to
# spacer_protospacer_regression_boxplot.png, which the boxplot section
# further down writes as well, so the ridge plot was always overwritten.
png(
  "../03_results/spacer_protospacer_ridgeplot.png",
  width = 2000, height = 2500, res = 300
)
ggridgplot
dev.off()
 ##--------------------
 ## boxplot
 ##--------------------

 # Quick look: protospacer values per explanation category.
 ggplot(
   countComparison_ggprep_02,
   aes(
     x = explainedBLAST, y = protospacer,
     color = explainedBLAST, fill = explainedBLAST
   )
 ) +
   geom_boxplot(alpha = 0.5)

 library(ggpubr)
 library(patchwork)

 # Protospacer values per category on a log10 y axis, annotated with the
 # result of a Wilcoxon test between the categories.
 plot_box_x <- ggplot(
   countComparison_ggprep_02,
   aes(
     x = explainedBLAST, group = explainedBLAST, y = protospacer,
     color = explainedBLAST, fill = explainedBLAST
   )
 ) +
   geom_boxplot(alpha = 0.5) +
   theme_classic() +
   labs(x = "", y = "protospacers [cpm]") +
   scale_y_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
   theme(legend.position = "none") +
   stat_compare_means(method = "wilcox.test")
 plot_box_x

 # Same plot for the spacer values (bwaCount).
 plot_box_y <- ggplot(
   countComparison_ggprep_02,
   aes(
     x = explainedBLAST, group = explainedBLAST, y = bwaCount,
     color = explainedBLAST, fill = explainedBLAST
   )
 ) +
   geom_boxplot(alpha = 0.5) +
   theme_classic() +
   labs(x = "", y = "spacers [cpm]") +
   scale_y_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
   theme(legend.position = "none") +
   stat_compare_means(method = "wilcox.test")
 plot_box_y

 # Both boxplots side by side: on screen, then written to disk.
 plot_box_x + plot_box_y

 png(
   "../03_results/spacer_protospacer_regression_boxplot.png",
   width = 2000, height = 2500, res = 300
 )
 plot_box_x + plot_box_y
 dev.off()

 
# legend_1 <- g_legend(p_mst1_sterm_woModi_clustered)
 
 library(gridExtra)

 # 2 x 2 arrangement of p1..p4. These objects are not created in this
 # section; presumably they are the plots built for the polishing figure
 # earlier in the script -- TODO confirm that this block belongs here.
 # The block used to be pasted twice verbatim, which rendered the figure and
 # wrote the same file twice; it is now executed once.
 grid.arrange(p1, p2, p3, p4, ncol = 2, heights = c(1, 2))

 # File version (note: row heights 2:3 instead of the 1:2 used on screen).
 svg("../03_results//polishing_K1.svg", width = 6, height = 5)
 grid.arrange(p1, p2, p3, p4, ncol = 2, heights = c(2, 3))
 dev.off()

 
 
 ##---------------------------
 # line plots
 ##---------------------------
 # Rebuild countComparison_ggprep_02 with the same filters as for the
 # regression plot, and additionally keep only rows with a localPHAGE value.
 countComparison_ggprep_02 <- countComparison_ggprep %>%
   filter(!is.na(protospacer)) %>%
   filter(!is.na(bwaCount)) %>%
   filter(explainedBLAST %in% c("localPHAGE", "phageDB")) %>%
   filter(!sample %in% c("di_K2_6h", "th_K2_8h")) %>%
   filter(!is.na(localPHAGE))
 table(countComparison_ggprep_02$explainedBLAST)

 countComparison_ggprep_02$explainedBLAST <- factor(
   countComparison_ggprep_02$explainedBLAST,
   levels = c("localPHAGE", "phageDB")
 )

 colorsssss <- c("darkcyan", "darkturquoise")

 # Formula used by stat_poly_eq() for the equation / R^2 annotation.
 my.formula <- y ~ x
 library(ggpmisc)

 # Spacer vs. protospacer with a linear fit, one panel per sample x
 # localPHAGE combination. The points are drawn by a single geom_point()
 # layer (an identical second layer used to be stacked on top) and the stray
 # empty trailing argument of stat_poly_eq() was removed.
 plot <- ggplot(
   countComparison_ggprep_02,
   aes(x = protospacer, y = bwaCount, color = explainedBLAST)
 ) +
   geom_point() +
   theme_classic() +
   facet_wrap(~sample + localPHAGE, ncol = 3) +
   scale_fill_manual(values = colorsssss) +
   scale_color_manual(values = colorsssss) +
   labs(x = "protospacer [cpm]", y = "spacers [cpm]") +
   scale_y_log10(breaks = c(1, 10, 100, 1000, 5000)) +
   scale_x_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
   geom_smooth(method = "lm") +
   stat_poly_eq(
     formula = my.formula,
     aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
     parse = TRUE
   )
 plot

 # Tall canvas because of the large number of panels.
 png("../03_results//spacer_vs_proto_04.png", width = 4000, height = 12000, res = 300)
 plot
 dev.off()



  # Protospacer value across samples, one line per spacer cluster (Name),
  # coloured by localPHAGE; all samples, legend hidden.
  p3 <- ggplot(
    countComparison_ggprep_02,
    aes(
      x = sample, y = protospacer, group = Name,
      color = localPHAGE, fill = localPHAGE
    )
  ) +
    geom_line(size = 0.5, alpha = 1) +
    labs("", x = "", y = "protospacer [cpm]") +
    theme_classic() +
    scale_x_discrete(expand = c(0, 0)) +
    theme(
      axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
      rect = element_rect(fill = "transparent"),
      legend.position = "none"
    )
  p3

  # Same plot without the samples Versand_202 and RMK202, legend at the bottom.
  countComparison_ggprep_03 <- countComparison_ggprep_02 %>%
    filter(!sample %in% c("Versand_202", "RMK202"))

  p4 <- ggplot(
    countComparison_ggprep_03,
    aes(
      x = sample, y = protospacer, group = Name,
      color = localPHAGE, fill = localPHAGE
    )
  ) +
    geom_line(size = 0.5, alpha = 1) +
    labs("", x = "", y = "protospacer [cpm]") +
    theme_classic() +
    scale_x_discrete(expand = c(0, 0)) +
    theme(
      axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
      rect = element_rect(fill = "transparent"),
      legend.position = "bottom"
    )
  p4

  # Both panels stacked, written to the results folder. The path used to be
  # "~../03_results//..." -- a typo that does not resolve to the results
  # directory used by every other figure, so the file could not be written.
  png("../03_results//spacer_vs_proto_05.png", width = 2500, height = 12000, res = 300)
  p3 / p4
  dev.off()



 ##---------------------------
 # line plots of viral DB
 ##---------------------------
 # Same filters as for the regression plot, but keep only the rows WITHOUT a
 # localPHAGE value (the complement of the subset used for the line plots
 # of the local phages).
 countComparison_ggprep_new <- countComparison_ggprep %>%
   filter(!is.na(protospacer)) %>%
   filter(!is.na(bwaCount)) %>%
   filter(explainedBLAST %in% c("localPHAGE", "phageDB")) %>%
   filter(!sample %in% c("di_K2_6h", "th_K2_8h")) %>%
   filter(is.na(localPHAGE))
 table(countComparison_ggprep_new$explainedBLAST)

 # countComparison_ggprep_new$explainedBLAST <- factor(countComparison_ggprep_new$explainedBLAST, levels = c("localPHAGE", "phageDB"))

 # Not applied to the plot below (the manual scales are disabled there).
 colorsssss <- c("darkcyan", "darkturquoise")

 # Formula used by stat_poly_eq() for the equation / R^2 annotation.
 my.formula <- y ~ x
 library(ggpmisc)

 # Spacer vs. protospacer with a linear fit, one panel per sample x
 # localPHAGE combination, default colour scale. The points are drawn by a
 # single geom_point() layer (an identical second layer used to be stacked
 # on top) and the stray empty trailing argument of stat_poly_eq() was
 # removed.
 plot <- ggplot(
   countComparison_ggprep_new,
   aes(x = protospacer, y = bwaCount, color = explainedBLAST)
 ) +
   geom_point() +
   theme_classic() +
   facet_wrap(~sample + localPHAGE, ncol = 3) +
   # scale_fill_manual(values = colorsssss) +
   # scale_color_manual(values = colorsssss) +
   labs(x = "protospacer [cpm]", y = "spacers [cpm]") +
   scale_y_log10(breaks = c(1, 10, 100, 1000, 5000)) +
   scale_x_log10(breaks = c(1, 10, 100, 1000, 10000, 100000)) +
   geom_smooth(method = "lm") +
   stat_poly_eq(
     formula = my.formula,
     aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
     parse = TRUE
   )
 plot

 # Protospacer value across samples, one line per spacer cluster (Name).
 p4 <- ggplot(
   countComparison_ggprep_new,
   aes(x = sample, y = protospacer, group = Name)
 ) +
   geom_line(size = 0.5, alpha = 1) +
   labs("", x = "", y = "protospacer [cpm]") +
   theme_classic() +
   scale_x_discrete(expand = c(0, 0)) +
   theme(
     axis.text.x = element_text(angle = 75, hjust = 1, size = 9),
     rect = element_rect(fill = "transparent"),
     legend.position = "bottom"
   )

 p4